0%

Principles:沙盒底层原理

沙盒基础知识

在 CTF 的 pwn 题中一般有两种函数调用方式实现沙盒机制:

使用 prctl 系统调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include<stdio.h>
#include<fcntl.h>
#include<unistd.h>
#include<stddef.h>
#include<linux/seccomp.h>
#include<linux/filter.h>
#include<sys/prctl.h>
#include<linux/bpf.h>
#include<sys/types.h>

int main(){
struct sock_filter filter[]={
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 0),
BPF_JUMP(BPF_JMP|BPF_JEQ, 59, 1, 0),
BPF_JUMP(BPF_JMP|BPF_JGE, 0, 1, 0),
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ERRNO),
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ALLOW),
};
struct sock_fprog prog={
.len=sizeof(filter)/sizeof(filter[0]),
.filter=filter,
};
prctl(PR_SET_NO_NEW_PRIVS,1,0,0,0);
prctl(PR_SET_SECCOMP,SECCOMP_MODE_FILTER,&prog);
syscall(59,"/bin/sh",NULL,NULL);
return 0;
}
1
2
3
4
5
0000: 0x20 0x00 0x00 0x00000000  A = sys_number
0001: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0003
0002: 0x35 0x01 0x00 0x00000000 if (A >= 0x0) goto 0004
0003: 0x06 0x00 0x00 0x00050000 return ERRNO(0)
0004: 0x06 0x00 0x00 0x7fff0000 return ALLOW

使用 seccomp 库函数:

1
2
3
4
5
6
7
8
9
10
11
12
#include <unistd.h>
#include <seccomp.h>
#include <linux/seccomp.h>

int main(void){
scmp_filter_ctx ctx;
ctx = seccomp_init(SCMP_ACT_ALLOW);
seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(execve), 0);
seccomp_load(ctx);
syscall(59,"/bin/sh",NULL,NULL);
return 0;
}
1
2
3
4
5
6
7
8
0000: 0x20 0x00 0x00 0x00000004  A = arch
0001: 0x15 0x00 0x05 0xc000003e if (A != ARCH_X86_64) goto 0007
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x35 0x00 0x01 0x40000000 if (A < 0x40000000) goto 0005
0004: 0x15 0x00 0x02 0xffffffff if (A != 0xffffffff) goto 0007
0005: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0007
0006: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0007: 0x06 0x00 0x00 0x00000000 return KILL
  • seccomp_load 函数进行逆向分析,可以发现其底层也是使用 prctl 系统调用
1
v17 = prctl(38LL, 1LL, 0LL, 0LL, 0LL); /* PR_SET_NO_NEW_PRIVS */
1
v14 = prctl(22LL, 2LL, v10, v10, v7); /* PR_SET_SECCOMP */

prctl 系统调用

prctl(Process Control Language,进程控制语言)是一个 Linux 系统调用的一个重要工具,它可以对进程进行各种管理和控制操作

prctl 提供了对进程的许多控制和设置,使用第一个参数来指定其功能:

  • 设置进程的权限级别
  • 设置进程的调度参数
  • 设置进程的内存限制
  • 设置进程的 CPU 时间限制
  • 设置进程的信号处理
  • 设置进程的资源限制
  • 设置进程的属性
  • 获取进程的属性

沙盒需要的 prctl 功能如下:

  • prctl(PR_SET_NO_NEW_PRIVS):命名空间内以 CAP_SYS_ADMIN 权限运行(子进程会保证不会赋予运行进程新的权限)
  • prctl(PR_SET_SECCOMP):第二个参数是设置的过滤模式,第三个参数是设置的过滤规则
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
unsigned char comm[sizeof(me->comm)];
long error;

error = security_task_prctl(option, arg2, arg3, arg4, arg5);
if (error != -ENOSYS)
return error;

error = 0;
switch (option) {
......

case PR_SET_SECCOMP:
error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;

......

case PR_SET_NO_NEW_PRIVS:
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;

task_set_no_new_privs(current);
break;

......

default:
error = -EINVAL;
break;
}
return error;
}

核心函数 prctl_set_seccomp 的调用链如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
unsigned int op;
void __user *uargs;

switch (seccomp_mode) {
case SECCOMP_MODE_STRICT: /* 严格模式(所有的syscall都被检查和过滤) */
op = SECCOMP_SET_MODE_STRICT;
uargs = NULL;
break;
case SECCOMP_MODE_FILTER: /* 过滤模式(所有的syscall都被允许,但是某些syscall可能会被过滤器拒绝) */
op = SECCOMP_SET_MODE_FILTER;
uargs = filter;
break;
default:
return -EINVAL;
}

return do_seccomp(op, 0, uargs);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
static long do_seccomp(unsigned int op, unsigned int flags,
void __user *uargs)
{
switch (op) {
case SECCOMP_SET_MODE_STRICT: /* 严格模式 */
if (flags != 0 || uargs != NULL)
return -EINVAL;
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER: /* 过滤模式 */
return seccomp_set_mode_filter(flags, uargs);
case SECCOMP_GET_ACTION_AVAIL: /* 用于查询特定的action是否被内核支持 */
if (flags != 0)
return -EINVAL;

return seccomp_get_action_avail(uargs);
case SECCOMP_GET_NOTIF_SIZES: /* 获取指定进程的安全上下文通知大小 */
if (flags != 0)
return -EINVAL;

return seccomp_get_notif_sizes(uargs);
default:
return -EINVAL;
}
}

这里我们重点分析过滤模式的 seccomp_set_mode_filter 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
static long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
int listener = -1;
struct file *listener_f = NULL;

......

prepared = seccomp_prepare_user_filter(filter); /* 在持有锁之前准备新过滤器 */
if (IS_ERR(prepared))
return PTR_ERR(prepared);

if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
listener = get_unused_fd_flags(O_CLOEXEC);
if (listener < 0) {
ret = listener;
goto out_free;
}

listener_f = init_listener(prepared); /* 初始化一个监听器,用于接收来自内核的通知和事件 */
if (IS_ERR(listener_f)) {
put_unused_fd(listener);
ret = PTR_ERR(listener_f);
goto out_free;
}
}

if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
mutex_lock_killable(&current->signal->cred_guard_mutex))
goto out_put_fd;

spin_lock_irq(&current->sighand->siglock);

if (!seccomp_may_assign_mode(seccomp_mode))
goto out;

if (has_duplicate_listener(prepared)) { /* 检查一个进程是否已经有一个监听器 */
ret = -EBUSY;
goto out;
}

ret = seccomp_attach_filter(flags, prepared); /* 将一个过滤器附加到受限制的安全上下文中 */
if (ret)
goto out;
/* Do not free the successfully attached filter. */
prepared = NULL;

seccomp_assign_mode(current, seccomp_mode, flags); /* 将一个受限制的安全上下文分配给一个进程(current当前进程) */
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
if (ret) {
listener_f->private_data = NULL;
fput(listener_f); /* 释放对文件的最后一个引用 */
put_unused_fd(listener); /* 说明目标文件描述符已经不再使用 */
seccomp_notify_detach(prepared);
} else {
fd_install(listener, listener_f);
ret = listener;
}
}
out_free:
seccomp_filter_free(prepared);
return ret;
}
  • 其最核心的工作就是在 current->seccomp.filter 中注册过滤器:
1
2
3
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);

如果使用了 FILTER 模式,则调用 seccomp_run_filters 函数来进行所有指令判断过滤,系统调用号作为参数传递,根据返回值来进行后续处理

这里我们分析一下从 syscall 入口函数 entry_SYSCALL_compatseccomp_run_filters 的调用链:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
SYM_CODE_START(entry_SYSCALL_compat)
UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs

/* Stash user ESP */
movl %esp, %r8d

/* Use %rsp as scratch reg. User ESP is stashed in r8 */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
......
movq %rsp, %rdi
call do_fast_syscall_32
......
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9

xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
sysretl
SYM_CODE_END(entry_SYSCALL_compat)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);
int res;

......

/* The case truncates any ptrace induced syscall nr > 2^32 -1 */
nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
return true;
}
1
2
3
4
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
return __syscall_enter_from_user_work(regs, syscall);
}
1
2
3
4
5
6
7
8
9
10
11
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;

ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);

return syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long ti_work)
{
long ret = 0;

......

if (ti_work & _TIF_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
}

......

return ret ? : syscall;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
int this_syscall;

if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;

this_syscall =
sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current));

switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
struct seccomp_filter *match = NULL;
int data;
struct seccomp_data sd_local;

rmb();

if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
}

filter_ret = seccomp_run_filters(sd, &match);

......

skip:
seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}

指令过滤函数 seccomp_run_filters 的源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f = READ_ONCE(current->seccomp.filter);

/* Ensure unexpected behavior doesn't result in failing open. */
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;

/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); /* 用于运行BPF程序的函数 */

if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
*match = f;
}
}
return ret;
}
  • bpf_prog_run_pin_on_cpu 是一个用于运行 BPF 程序的函数,该函数将 BPF 程序加载到指定 CPU 的内存中,并将其附加到指定 CPU 的运行队列中
1
2
3
4
5
6
7
8
9
10
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
const void *ctx)
{
u32 ret;

migrate_disable(); /* 禁用内核进程迁移 */
ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); /* 运行BPF程序 */
migrate_enable(); /* 启用内核进程迁移 */
return ret;
}

eBPF 虚拟机

Linux 下 eBPF 的整体架构如下图所示:

  • 传入:用户进程首先在用户空间编写相应的 BPF 字节码程序,传入内核
  • 检查:内核通过 verifier 对字节码程序进行安全性检查
  • 编译 or 解释:通过检查后便通过 JIT 编译运行,或者直接解释运行 BPF 字节码
  • 映射:用以保存数据的通用结构,可以在不同的 eBPF 程序之间或是用户进程与内核间共享数据(不同的 eBPF 程序之间可以共享同一个映射)

eBPF 底层是一个使用 RISC 指令集的虚拟机,使用11个64位寄存器和一个固定大小为512字节的栈:

  • 其中9个寄存器是通用寄存器
  • 一个只读栈帧寄存器

寄存器总是64位大小,在32位机器上会默认把前32位置零,这也为 eBPF 提供了交叉编译的兼容性,各个寄存器的功能如下:

  • R0: RAX,存放函数返回值或程序退出状态码
  • R1: RDI,第一个实参
  • R2: RSI,第二个实参
  • R3: RDX,第三个实参
  • R4: RCX,第四个实参
  • R5: R8,第五个实参
  • R6: RBX,callee saved
  • R7: R13,callee saved
  • R8: R14,callee saved
  • R9: R15,callee saved
  • R10: RBP,只读栈帧

在 eBPF 中,一个寄存器的状态信息使用 bpf_reg_state 进行表示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type; /* 记录寄存器类型 */
/* Fixed part of pointer offset, pointer types only */
s32 off;
union {
/* valid when type == PTR_TO_PACKET */
int range;

/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;

/* for PTR_TO_BTF_ID */
struct {
struct btf *btf;
u32 btf_id;
};

u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */

/* Max size from any of the above. */
struct {
unsigned long raw1;
unsigned long raw2;
} raw;
};
/* For PTR_TO_PACKET, used to find other pointers with the same variable
* offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
* came from, when one is tested for != NULL.
* For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
* for the purpose of tracking that it's freed.
* For PTR_TO_SOCKET this is used to share which pointers retain the
* same reference to the socket, to determine proper reference freeing.
*/
u32 id;
/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
* from a pointer-cast helper, bpf_sk_fullsock() and
* bpf_tcp_sock().
*
* Consider the following where "sk" is a reference counted
* pointer returned from "sk = bpf_sk_lookup_tcp();":
*
* 1: sk = bpf_sk_lookup_tcp();
* 2: if (!sk) { return 0; }
* 3: fullsock = bpf_sk_fullsock(sk);
* 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
* 5: tp = bpf_tcp_sock(fullsock);
* 6: if (!tp) { bpf_sk_release(sk); return 0; }
* 7: bpf_sk_release(sk);
* 8: snd_cwnd = tp->snd_cwnd; // verifier will complain
*
* After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
* "tp" ptr should be invalidated also. In order to do that,
* the reg holding "fullsock" and "sk" need to remember
* the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
* such that the verifier can reset all regs which have
* ref_obj_id matching the sk_reg->id.
*
* sk_reg->ref_obj_id is set to sk_reg->id at line 1.
* sk_reg->id will stay as NULL-marking purpose only.
* After NULL-marking is done, sk_reg->id can be reset to 0.
*
* After "fullsock = bpf_sk_fullsock(sk);" at line 3,
* fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
*
* After "tp = bpf_tcp_sock(fullsock);" at line 5,
* tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
* which is the same as sk_reg->ref_obj_id.
*
* From the verifier perspective, if sk, fullsock and tp
* are not NULL, they are the same ptr with different
* reg->type. In particular, bpf_sk_release(tp) is also
* allowed and has the same effect as bpf_sk_release(sk).
*/
u32 ref_obj_id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
* from the pointed-to object, and is shared with all bpf_reg_states
* with the same id as us.
*/
struct tnum var_off;
/* Used to determine if any memory access using this register will
* result in a bad access.
* These refer to the same value as var_off, not necessarily the actual
* contents of the register.
*/
s64 smin_value; /* minimum possible (s64)value */
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
s32 s32_min_value; /* minimum possible (s32)value */
s32 s32_max_value; /* maximum possible (s32)value */
u32 u32_min_value; /* minimum possible (u32)value */
u32 u32_max_value; /* maximum possible (u32)value */
/* parentage chain for liveness checking */
struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
*/
u32 frameno;
/* Tracks subreg definition. The stored value is the insn_idx of the
* writing insn. This is safe because subreg_def is used before any insn
* patching which only happens after main verification finished.
*/
s32 subreg_def;
enum bpf_reg_liveness live;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};

当 eBPF 字节码载入到内核中后,verifier 会对 eBPF 字节码进行一系列的检查,主要关注以下几个字段:

  • smin_valuesmax_value:64 位有符号的值的可能取值边界
  • umin_valueumax_value:64 位无符号的值的可能取值边界
  • s32_min_values32_max_value:32 位有符号的值的可能取值边界
  • u32_min_valueu32_max_value:32 位无符号的值的可能取值边界

核心检查函数就是 bpf_check,一个静态代码分析器:

  • 逐条遍历 eBPF 程序中的指令并更新寄存器 / 堆栈的状态,条件分支的所有路径都会被分析,直到 bpf_exit 指令
  • 这其实是一个模拟执行的过程,verifier 会推测寄存器的边界值,检查其是否符合规则
  • 模拟通过后才能正常加载 eBPF 程序

在其中用于检测指令合法性的函数为 do_check,该函数会遍历每一条指令并根据指令的不同类型进行不同操作,对于算术指令(BPF_ALU / BPF_ALU64)而言有如下调用链

1
2
3
4
do_check()        					// 遍历每一条指令并根据类型调用相应函数处理
->check_alu_op() // 根据算术指令的opcode进行不同处理
->adjust_reg_min_max_vals() // 计算新的寄存器边界值
->adjust_scalar_min_max_vals() // 根据opcode计算具体的新边界值

当 eBPF 字节码载入到内核中后,内核最终会使用一个 bpf_prog 结构体来表示一个 eBPF 程序:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
/* Instructions for interpreter */
struct sock_filter insns[0];
struct bpf_insn insnsi[];
};

接着就是编译,解释 eBPF 字节码,生成 bpf map 并记录在 bpf_reg_state->map_ptr

bpf map 是一个通用的用以储存不同种类数据的结构,用以在用户进程与 eBPF 程序、eBPF 程序与 eBPF 程序之间进行数据共享(这些数据以二进制形式储存,因此用户在创建时只需要指定 key 与 value 的 size)

核心结构体 bpf_map 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
*/
const struct bpf_map_ops *ops ____cacheline_aligned;
struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
void *security;
#endif
enum bpf_map_type map_type; /* map的数据结构类型 */
u32 key_size; /* 以字节为单位的用以索引一个元素的key的size(在数组映射中使用) */
u32 value_size; /* 以字节为单位的每个元素的size */
u32 max_entries; /* map中entries的最大数量 */
u32 map_flags; /* 描述map的独特特征(例如是否整个map的内存应被预先分配等) */
int spin_lock_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */

/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
struct work_struct work;
struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

bpf map 有多种类型,记录于 bpf_map_type 枚举中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH, /* 以哈希表形式存储键值对(最常见) */
BPF_MAP_TYPE_ARRAY, /* 以数组形式存储键值对,key即为数组下标,value初始化为'0' */
BPF_MAP_TYPE_PROG_ARRAY, /* 特殊的数组映射,value为其他eBPF程序的文件描述符 */
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK, /* 以栈形式存储数据 */
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
};

Seccomp BPF

柏克莱封包过滤器(Berkeley Packet Filter,缩写 BPF),是类 Unix 系统上数据链路层的一种原始接口,提供原始链路层封包的收发,除此之外,如果网卡驱动支持洪泛模式,那么它可以让网卡处于此种模式,这样可以收到网络上的所有包,不管他们的目的地是不是所在主机

Seccomp BPF 是一种基于 Linux 内核的 BPF 过滤器,用于对 Linux 进程的系统调用进行过滤和拦截(eBPF 的一部分)

BPF 的指令集比较简单,开发人员定义了符号常量和两个方便的宏 BPF_STMTBPF_JUMP 可以用来方便的编写 BPF 规则

1
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,(offsetof(struct seccomp_data, arch)))
  • BPF_LD:建一个 BPF 加载操作
  • BPF_W:操作数大小是一个字
  • BPF_ABS:使用绝对偏移,即使用指令中的值作为数据区的偏移量,该值是体系结构字段与数据区域的偏移量
  • offsetof():生成数据区域中期望字段的偏移量
1
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K ,AUDIT_ARCH_X86_64 , 1, 0)
  • BPF_JMP | BPF JEQ 会创建一个相等跳转指令,它将指令中的值(即第二个参数)与累加器中的值(BPF_K)进行比较,判断是否相等
    • 如果架构是则跳过下一条指令(jt=1,代表测试为真,跳过一条指令)
    • 否则将执行下一条指令(jf=0,代表测试为假,继续执行下一条指令)

用户编写的 eBPF 程序最终会被编译成 eBPF 字节码,eBPF 字节码使用 bpf_insn 结构来表示,如下:

1
2
3
4
5
6
7
struct bpf_insn {
__u8 code; /* 操作码 */
__u8 dst_reg:4; /* 目标寄存器 */
__u8 src_reg:4; /* 源寄存器 */
__s16 off; /* 偏移量 */
__s32 imm; /* 立即操作数 */
};

eBPF 程序会被 LLVM/Clang 编译成 bpf_insn 结构数组,这里使用了 JIT 即时编译技术(PS:当 eBPF 字节码被加载到内核时,内核会根据是否开启了 JIT 功能选项,来决定是否将 eBPF 字节码编译成机器码)

内核通过 bpf_prog_load 函数来加载 eBPF 字节码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog; /* 保存eBPF程序的信息 */
int err;
char license[128];
bool is_gpl;

......

/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); /* 初始化bpf_prog结构体 */
if (!prog)
return -ENOMEM;

......

/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;

prog = bpf_prog_select_runtime(prog, &err); /* 判断并使用jit进行编译 */
if (err < 0)
goto free_used_maps;

err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;

bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);

err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;

free_used_maps:
/* In case we have subprogs, we need to wait for a grace
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
  • 函数 bpf_prog_load 会调用 bpf_prog_select_runtime 函数来判断是否使用 JIT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
if (fp->bpf_func)
goto finalize;

bpf_prog_select_func(fp);

if (!bpf_prog_is_dev_bound(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;

fp = bpf_int_jit_compile(fp); /* 判断是否需要将eBPF字节码编译成机器码 */
if (!fp->jited) {
bpf_prog_free_jited_linfo(fp);
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
#endif
} else {
bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
return fp;
}

finalize:
bpf_prog_lock_ro(fp);

*err = bpf_check_tail_call(fp);

return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
  • 对于不同的架构,函数 bpf_int_jit_compile 有不同的实现
  • 这里我们只分析 x86_64 架构下的 bpf_int_jit_compile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;

if (!prog->jit_requested) /* 判断是否支持jit */
return orig_prog;

tmp = bpf_jit_blind_constants(prog);
/*
* If blinding was requested and we failed during blinding,
* we must fall back to the interpreter.
*/
if (IS_ERR(tmp))
return orig_prog;
if (tmp != prog) {
tmp_blinded = true;
prog = tmp;
}

jit_data = prog->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
prog = orig_prog;
goto out;
}
prog->aux->jit_data = jit_data;
}
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out_addrs;
}

/*
* Before first pass, make a rough estimation of addrs[]
* each BPF instruction is translated to less than 64 bytes
*/
for (proglen = 0, i = 0; i <= prog->len; i++) {
proglen += 64;
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
skip_init_addrs:

/*
* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large BPF programs
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx); /* 将eBPF字节码编译成本地机器码 */
if (proglen <= 0) {
out_image:
image = NULL;
if (header)
bpf_jit_binary_free(header);
prog = orig_prog;
goto out_addrs;
}
if (image) {
if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen);
goto out_image;
}
break;
}
if (proglen == oldproglen) {
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries *
sizeof(struct exception_table_entry);

/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
}
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
cond_resched();
}

if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, image); /* 打印eBPF字节码编译后的机器码 */

if (image) {
if (!prog->is_func || extra_pass) {
bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
}
prog->bpf_func = (void *)image; /* 将eBPF执行函数设置成编译后的机器码 */
prog->jited = 1;
prog->jited_len = proglen;
} else {
prog = orig_prog;
}

if (!image || !prog->is_func || extra_pass) {
if (image)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
out:
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?
tmp : orig_prog);
return prog;
}

当函数 do_jit 将 eBPF 码编译为字节码后,可以直接调用 prog->bpf_func 来执行字节码

当内核要执行 eBPF 字节码时,会调用原本位于 prog->bpf_func 的函数 __bpf_prog_run,该函数是 BPF 的核心函数入口,该函数被多个不同 stack size 的函数调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
}; /* 维护了一个跳表,根据opcode来进行跳转 */
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;

#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
goto *jumptable[insn->code];

/* ALU */
#define ALU(OPCODE, OP) \
ALU64_##OPCODE##_X: \
DST = DST OP SRC; \
CONT; \
ALU_##OPCODE##_X: \
DST = (u32) DST OP (u32) SRC; \
CONT; \
ALU64_##OPCODE##_K: \
DST = DST OP IMM; \
CONT; \
ALU_##OPCODE##_K: \
DST = (u32) DST OP (u32) IMM; \
CONT;

ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
ALU(LSH, <<)
ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
#undef ALU
ALU_NEG:
DST = (u32) -DST;
CONT;
ALU64_NEG:
DST = -DST;
CONT;
ALU_MOV_X:
DST = (u32) SRC;
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
DST = SRC;
CONT;
ALU64_MOV_K:
DST = IMM;
CONT;
LD_IMM_DW:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
ALU_ARSH_X:
DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
div64_u64_rem(DST, SRC, &AX);
DST = AX;
CONT;
ALU_MOD_X:
AX = (u32) DST;
DST = do_div(AX, (u32) SRC);
CONT;
ALU64_MOD_K:
div64_u64_rem(DST, IMM, &AX);
DST = AX;
CONT;
ALU_MOD_K:
AX = (u32) DST;
DST = do_div(AX, (u32) IMM);
CONT;
ALU64_DIV_X:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
AX = (u32) DST;
do_div(AX, (u32) SRC);
DST = (u32) AX;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
AX = (u32) DST;
do_div(AX, (u32) IMM);
DST = (u32) AX;
CONT;
ALU_END_TO_BE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_be16(DST);
break;
case 32:
DST = (__force u32) cpu_to_be32(DST);
break;
case 64:
DST = (__force u64) cpu_to_be64(DST);
break;
}
CONT;
ALU_END_TO_LE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_le16(DST);
break;
case 32:
DST = (__force u32) cpu_to_le32(DST);
break;
case 64:
DST = (__force u64) cpu_to_le64(DST);
break;
}
CONT;

/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5);
CONT;

JMP_CALL_ARGS:
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
BPF_R3, BPF_R4,
BPF_R5,
insn + insn->off + 1);
CONT;

JMP_TAIL_CALL: {
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
u32 index = BPF_R3;

if (unlikely(index >= array->map.max_entries))
goto out;
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;

tail_call_cnt++;

prog = READ_ONCE(array->ptrs[index]);
if (!prog)
goto out;

/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
* handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
goto select_insn;
out:
CONT;
}
JMP_JA:
insn += insn->off;
CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
JMP_##OPCODE##_X: \
if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_X: \
if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP_##OPCODE##_K: \
if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT; \
JMP32_##OPCODE##_K: \
if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
insn += insn->off; \
CONT_JMP; \
} \
CONT;
COND_JMP(u, JEQ, ==)
COND_JMP(u, JNE, !=)
COND_JMP(u, JGT, >)
COND_JMP(u, JLT, <)
COND_JMP(u, JGE, >=)
COND_JMP(u, JLE, <=)
COND_JMP(u, JSET, &)
COND_JMP(s, JSGT, >)
COND_JMP(s, JSLT, <)
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
CONT; \
ST_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;

LDST(B, u8)
LDST(H, u16)
LDST(W, u32)
LDST(DW, u64)
#undef LDST
#define LDX_PROBE(SIZEOP, SIZE) \
LDX_PROBE_MEM_##SIZEOP: \
bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
CONT;
LDX_PROBE(B, 1)
LDX_PROBE(H, 2)
LDX_PROBE(W, 4)
LDX_PROBE(DW, 8)
#undef LDX_PROBE

STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
(DST + insn->off));
CONT;
STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;

default_label:
/* If we ever reach this, we have a bug somewhere. Die hard here
* instead of just returning 0; we could be somewhere in a subprog,
* so execution could continue otherwise which we do /not/ want.
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
BUG_ON(1);
return 0;
}