0%

Linux-Lab10-Kernel Profiling

Kernel Profiling

实验目标:

  • 熟悉 Linux 内核分析的基础知识
  • 了解基本的分析工具
  • 学习剖析方法和良好实践

本课程旨在把我们迄今为止在内核空间中所做的工作与现实世界的用例结合起来:在这些用例中,我们并不编写内核空间代码,而是使用分析工具观察内核,以便调试在编写常规的底层应用程序时遇到的问题

本课程的另一个重点是学习调试软件问题的一般方法,我们将介绍一些工具,这些工具让我们从内核中深入了解应用程序的运行方式

1
2
3
make clean
LABS=kernel_profiling make skels
make build

在使用 I/O 时,我们必须记住:与内存(速度快一个数量级)和调度(处理 CPU 上当前运行的内容)相比,I/O 是操作系统中最慢的子系统之一

Investigating Reduced Responsiveness

插入 io.ko 模块后,系统的响应能力会降低:在命令行键入命令时会断断续续地卡顿;但是运行 top 时,我们看到系统的负载并不高,也没有任何进程占用大量资源

了解 io.ko 模块正在做什么,以及为什么它会降低系统的响应能力

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/task.h>

/*
 * Device numbers and module name. None of these three macros is referenced
 * by the code in this listing.
 */
#define MY_MAJOR 42
#define MY_MINOR 0
#define MODULE_NAME "deferred"

/* Timer type constants; also not referenced by the code in this listing. */
#define TIMER_TYPE_NONE -1
#define TIMER_TYPE_SET 0
#define TIMER_TYPE_ALLOC 1
#define TIMER_TYPE_MON 2

MODULE_DESCRIPTION("Generate disruptive interrupts");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/*
 * The single timer used by this module: set up and armed in deferred_init(),
 * re-armed from timer_handler(), and removed in deferred_exit().
 */
struct timer_list timer;

/*
 * timer_handler() - timer callback that deliberately burns CPU time.
 * @tl: the timer that fired (unused; the global `timer` is re-armed instead).
 *
 * Busy-waits until HZ jiffies have elapsed and then re-arms the global timer
 * to fire again HZ jiffies later, so the callback keeps repeating until the
 * timer is deleted.
 */
static void timer_handler(struct timer_list *tl)
{
	unsigned long deadline = jiffies + HZ;

	/*
	 * Compare through the signed difference (the same test time_before()
	 * performs) so the loop still ends on time when the jiffies counter
	 * wraps around. A plain `jiffies < deadline` can keep spinning far
	 * longer than intended if the counter wraps during the wait.
	 */
	while ((long)(jiffies - deadline) < 0) {
		(void)0;
	}

	mod_timer(&timer, jiffies + HZ);
}

/*
 * deferred_init() - module entry point.
 *
 * Logs a message, binds timer_handler() to the global timer and arms it to
 * fire for the first time 5 * HZ jiffies from now.
 *
 * Return: always 0.
 */
static int deferred_init(void)
{
	pr_info("[deferred_init] Init module\n");

	timer_setup(&timer, timer_handler, 0);
	mod_timer(&timer, jiffies + 5 * HZ);

	return 0;
}

/*
 * deferred_exit() - module exit point.
 *
 * Logs a message and removes the global timer with del_timer_sync() so the
 * self-re-arming handler no longer runs once the module is gone.
 */
static void deferred_exit(void)
{
	pr_info("[deferred_exit] Exit module\n" );

	del_timer_sync(&timer);
}

/* Register deferred_init() / deferred_exit() as the module's load and unload hooks. */
module_init(deferred_init);
module_exit(deferred_exit);
  • 加载内核模块 io.ko 后,程序 shell 有明显的卡顿
  • 使用 top 命令:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Mem: 33064K used, 205848K free, 100K shrd, 292K buff, 4444K cached
CPU: 0% usr 0% sys 0% nic 99% idle 0% io 0% irq 0% sirq
Load average: 0.14 0.29 0.16 1/38 239
PID PPID USER STAT VSZ %VSZ %CPU COMMAND
239 208 root R 2972 1% 1% top
10 2 root IW 0 0% 0% [rcu_sched]
208 1 root S 2972 1% 0% -sh
198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages
202 1 root S 2828 1% 0% /sbin/klogd -n
207 1 root S 2828 1% 0% /sbin/getty 38400 tty1
209 1 root S 2828 1% 0% /sbin/getty 38400 tty2
211 1 root S 2828 1% 0% /sbin/getty 38400 tty4
210 1 root S 2828 1% 0% /sbin/getty 38400 tty3
212 1 root S 2828 1% 0% /sbin/getty 38400 tty5
187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p
1 0 root S 2004 1% 0% init [5]
9 2 root SW 0 0% 0% [ksoftirqd/0]
42 2 root SWN 0 0% 0% [kmemleak]
13 2 root SW 0 0% 0% [kdevtmpfs]
39 2 root IW 0 0% 0% [kworker/0:2-eve]
7 2 root IW 0 0% 0% [kworker/u2:0-fl]
34 2 root IW< 0 0% 0% [kworker/0:1H-kb]
38 2 root IW 0 0% 0% [kworker/u2:1-fl]
2 0 root SW 0 0% 0% [kthreadd]
  • 发现没有进程占用资源

Launching New Threads

执行 scheduling 二进制文件时,它会从 100 个并行运行的实例中打印消息,有两种形式:

  • 创建线程
  • 创建进程
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <pthread.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Write the worker index `i` to standard output, followed by a newline. */
void helper(int i)
{
    fprintf(stdout, "%d\n", i);
}

/*
 * Thread entry point: prints the worker index that was smuggled through the
 * `arg` pointer, then terminates the thread.
 *
 * arg: an integer index stored directly in the pointer value (not a pointer
 *      to an int). It is converted back through intptr_t so the conversion is
 *      well-defined where pointers are wider than int.
 *
 * Does not return normally; the thread ends through pthread_exit(NULL).
 */
void * thread_start(void *arg) {
    int index = (int)(intptr_t) arg;

    helper(index);
    pthread_exit(NULL);
}

/*
 * Start 300 workers that each print their index, either as threads
 * (mode 0) or as forked child processes (any other mode).
 *
 * Usage: ./scheduling <mode>
 *
 * The time around the creation loop is sampled with gettimeofday() but is not
 * printed by this program. Threads are joined and children are reaped only
 * after the second sample, so the sampled interval still covers creation
 * only.
 *
 * Returns 0 on success, -1 when the mode argument is missing.
 */
int main(int argc, char *argv[]) {
    enum { NUM_WORKERS = 300 };

    pthread_t tid[NUM_WORKERS];
    struct timeval begin, end;
    int use_threads;
    int started = 0;    /* number of threads successfully created */
    int in_child = 0;   /* set in a forked child so it skips the cleanup */

    /* argv[1] is read below, so at least one argument is required. */
    if (argc < 2) {
        printf("./scheduling <mode>\n");
        return -1;
    }

    /* The mode never changes, so parse it once instead of per iteration. */
    use_threads = (atoi(argv[1]) == 0);

    gettimeofday(&begin, NULL);

    for (int i = 0; i < NUM_WORKERS; i++) {
        if (use_threads) {
            /* Pass the index by value inside the pointer. */
            if (pthread_create(&tid[started], NULL, thread_start,
                               (void *)(intptr_t) i) != 0)
                break;
            started++;
        } else {
            pid_t pid = fork();

            if (pid == 0) {
                helper(i);
                in_child = 1;
                break;
            }
            if (pid < 0) {
                perror("fork");
                break;
            }
        }
    }

    gettimeofday(&end, NULL);

    /* A child has nothing to join or reap. */
    if (in_child)
        return 0;

    /*
     * Returning from main ends the whole process, so wait for every thread
     * that was started; otherwise some may never get to print.
     */
    for (int j = 0; j < started; j++)
        pthread_join(tid[j], NULL);

    /* Reap the children so they finish before the parent exits. */
    if (!use_threads) {
        while (wait(NULL) > 0)
            ;
    }

    return 0;
}
  • 进程:结果更加稳定,但速度慢
  • 线程:速度更快,但结果不稳定

Tuning CP

我们的目标是编写一个 Linux 自带的 cp 工具的副本,该工具已由 memory 二进制文件实现,它实现了两种可用于复制操作的方法:

  • 使用 read 系统调用读取内存中缓冲区中源文件的内容,并使用 write 系统调用将该缓冲区写入目标文件
  • 使用 mmap 系统调用将源文件和目标文件映射到内存,并将源文件的内容复制到内存中的目标文件

对比两种方法的性能:

  • 调查两种复制机制中的哪一种更快(对于此步骤,您将使用1024块大小)
  • 找出哪种复制机制更快之后,请更改块大小参数,看看哪个值能得到最快的复制速度
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/*
 * Copy `size` bytes from src_fd to dst_fd through two shared file mappings,
 * moving at most `blk_size` bytes per memcpy() call.
 *
 * Returns 0 on success, -1 on failure (a message is printed with perror()).
 */
static int copy_with_mmap(int src_fd, int dst_fd, size_t size, size_t blk_size)
{
    char *src_base;
    char *dst_base;
    size_t done = 0;
    int rc = -1;

    /* mmap() rejects zero-length mappings and there is nothing to copy. */
    if (size == 0)
        return 0;

    /* mmap() reports failure with MAP_FAILED, not with a negative value. */
    src_base = mmap(NULL, size, PROT_READ, MAP_SHARED, src_fd, 0);
    if (src_base == MAP_FAILED) {
        perror("mmap src");
        return -1;
    }

    dst_base = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, dst_fd, 0);
    if (dst_base == MAP_FAILED) {
        perror("mmap dst");
        goto out_unmap_src;
    }

    while (done < size) {
        size_t chunk = size - done < blk_size ? size - done : blk_size;

        memcpy(dst_base + done, src_base + done, chunk);
        done += chunk;
    }

    /* Flush from the start of the mapping, not from a moved cursor. */
    if (msync(dst_base, size, MS_SYNC) < 0)
        perror("msync");
    else
        rc = 0;

    munmap(dst_base, size);
out_unmap_src:
    munmap(src_base, size);
    return rc;
}

/*
 * Copy `size` bytes from src_fd to dst_fd with pread()/pwrite(), using a
 * heap buffer of `blk_size` bytes. Short reads and short writes are handled.
 *
 * Returns 0 on success, -1 on failure.
 */
static int copy_with_rw(int src_fd, int dst_fd, size_t size, size_t blk_size)
{
    char *buf;
    size_t done = 0;
    int rc = -1;

    buf = malloc(blk_size);
    if (buf == NULL) {
        perror("malloc");
        return -1;
    }

    while (done < size) {
        size_t chunk = size - done < blk_size ? size - done : blk_size;
        size_t put = 0;
        ssize_t got;

        /* Every chunk, including the last, sits at offset `done`. */
        got = pread(src_fd, buf, chunk, (off_t) done);
        if (got < 0) {
            perror("pread");
            goto out;
        }
        if (got == 0) {
            fprintf(stderr, "pread: unexpected end of file\n");
            goto out;
        }

        /* Write exactly the bytes that were read, never a full block. */
        while (put < (size_t) got) {
            ssize_t w = pwrite(dst_fd, buf + put, (size_t) got - put,
                               (off_t) (done + put));

            if (w <= 0) {
                perror("pwrite");
                goto out;
            }
            put += (size_t) w;
        }

        done += (size_t) got;
    }

    rc = 0;
out:
    free(buf);
    return rc;
}

/*
 * Copy a file using either mmap()+memcpy() (mode 0) or pread()/pwrite()
 * (any other mode), in chunks of blk_size bytes.
 *
 * Usage: ./memory <mode> <blk_size> <src> <dst>
 *
 * Returns 0 on success, -1 on any error.
 */
int main(int argc, char *argv[]) {
    int src_fd, dst_fd, mode;
    int rc = -1;
    struct stat st;
    unsigned long blk_size;
    size_t size;
    char *end;

    /* All four arguments are used below (argv[1] .. argv[4]). */
    if (argc < 5) {
        printf("./memory <mode> <blk_size> <src> <dst>\n");
        return -1;
    }

    mode = atoi(argv[1]);

    /* A block size of zero would never make progress. */
    blk_size = strtoul(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || blk_size == 0) {
        fprintf(stderr, "invalid blk_size '%s'\n", argv[2]);
        return -1;
    }

    printf("mode %d blk_size %lu src %s dst %s\n",
           mode, blk_size, argv[3], argv[4]);

    src_fd = open(argv[3], O_RDONLY);
    if (src_fd < 0) {
        perror("open src");
        return -1;
    }

    /* Query the descriptor that was just opened instead of the path again. */
    if (fstat(src_fd, &st) < 0) {
        perror("fstat");
        goto out_close_src;
    }
    size = (size_t) st.st_size;

    dst_fd = open(argv[4], O_CREAT | O_RDWR | O_TRUNC,
                  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
    if (dst_fd < 0) {
        perror("open dst");
        goto out_close_src;
    }

    /* Size the destination up front so it can be mapped in full. */
    if (ftruncate(dst_fd, (off_t) size) < 0) {
        perror("ftruncate");
        goto out_close_dst;
    }

    if (mode == 0)
        rc = copy_with_mmap(src_fd, dst_fd, size, blk_size);
    else
        rc = copy_with_rw(src_fd, dst_fd, size, blk_size);

out_close_dst:
    close(dst_fd);
out_close_src:
    close(src_fd);
    return rc;
}
  • 结果:
1
2
3
4
root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 1 10240000 1 2         
mode 1 blk_size 10240000 src 1 dst 2
root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 0 10240000 1 2
mode 0 blk_size 10240000 src 1 dst 2
  • 使用 read/write 有明显的延迟,使用 I/O,速度较慢
  • 使用 mmap 几乎可以瞬间完成,使用映射,速度较快

I/O Latency

我们编写了一个读取磁盘内容的模块,插入 bio.ko 模块,我们看到系统负载出现较大峰值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Mem: 180032K used, 58880K free, 156K shrd, 324K buff, 15248K cached
CPU: 0% usr 46% sys 0% nic 51% idle 0% io 0% irq 1% sirq
Load average: 17.43 5.01 1.75 1/77 371
PID PPID USER STAT VSZ %VSZ %CPU COMMAND
34 2 root IW< 0 0% 24% [kworker/0:1H-kb]
9 2 root SW 0 0% 4% [ksoftirqd/0]
10 2 root IW 0 0% 2% [rcu_sched]
371 208 root R 2972 1% 1% top
361 2 root DW 0 0% 1% [mykwriterd10]
364 2 root DW 0 0% 1% [mykwriterd13]
365 2 root DW 0 0% 1% [mykwriterd14]
366 2 root DW 0 0% 1% [mykwriterd15]
367 2 root DW 0 0% 1% [mykwriterd16]
368 2 root DW 0 0% 1% [mykwriterd17]
369 2 root DW 0 0% 1% [mykwriterd18]
370 2 root DW 0 0% 1% [mykwriterd19]
353 2 root DW 0 0% 1% [mykwriterd2]
354 2 root DW 0 0% 1% [mykwriterd3]
355 2 root DW 0 0% 1% [mykwriterd4]
356 2 root DW 0 0% 1% [mykwriterd5]
357 2 root DW 0 0% 1% [mykwriterd6]
358 2 root DW 0 0% 1% [mykwriterd7]
359 2 root DW 0 0% 1% [mykwriterd8]
360 2 root DW 0 0% 1% [mykwriterd9]

Bad ELF

我们设法构建了一个ELF文件(作为 Unikraft 构建的一部分),该文件在进行静态分析时是有效的,但无法执行

1
2
5-bad-elf git:(master) ✗ ./bad_elf 
[1] 7357 segmentation fault ./bad_elf
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
5-bad-elf git:(master) ✗ readelf -a bad_elf
ELF 头:
Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
类别: ELF64
数据: 2 补码,小端序 (little endian)
Version: 1 (current)
OS/ABI: UNIX - System V
ABI 版本: 0
类型: EXEC (可执行文件)
系统架构: Advanced Micro Devices X86-64
版本: 0x1
入口点地址: 0x400130
程序头起点: 64 (bytes into file)
Start of section headers: 60352 (bytes into file)
标志: 0x0
Size of this header: 64 (bytes)
Size of program headers: 56 (bytes)
Number of program headers: 3
Size of section headers: 64 (bytes)
Number of section headers: 10
Section header string table index: 9

节头:
[号] 名称 类型 地址 偏移量
大小 全体大小 旗标 链接 信息 对齐
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .text PROGBITS 00000000004000f0 000000f0
000000000000be11 0000000000000000 WAX 0 0 16
[ 2] .uk_thread_initta NOBITS 000000000040bf01 0000bf01
0000000000000007 0000000000000000 WA 0 0 1
[ 3] .uk_inittab PROGBITS 000000000040c000 0000bf20
0000000000000008 0000000000000000 A 0 0 8
[ 4] .rodata PROGBITS 000000000040c020 0000bf40
0000000000002b85 0000000000000000 A 0 0 32
[ 5] .tbss NOBITS 000000000040eba8 0000eac5
0000000000000000 0000000000000000 WAT 0 0 1
[ 6] .data PROGBITS 000000000040ebb0 0000ead0
0000000000000070 0000000000000000 WA 0 0 16
[ 7] .bss NOBITS 000000000040ec20 0000eb40
00000000000003e8 0000000000000000 WA 0 0 32
[ 8] .comment PROGBITS 0000000000000000 0000eb40
0000000000000029 0000000000000001 MS 0 0 1
[ 9] .shstrtab STRTAB 0000000000000000 0000eb69
0000000000000052 0000000000000000 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
L (link order), O (extra OS processing required), G (group), T (TLS),
C (compressed), x (unknown), o (OS specific), E (exclude),
l (large), p (processor specific)

There are no section groups in this file.

程序头:
Type Offset VirtAddr PhysAddr
FileSiz MemSiz Flags Align
LOAD 0x00000000000000f0 0x00000000004000f0 0x00000000004000f0
0x000000000000be11 0x000000000000be18 RWE 0x10
LOAD 0x000000000000bf20 0x000000000040c000 0x000000000040c000
0x0000000000002c20 0x0000000000003008 RW 0x20
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RWE 0x10

Section to Segment mapping:
段节...
00 .text .uk_thread_inittab
01 .uk_inittab .rodata .data .bss
02

There is no dynamic section in this file.

该文件中没有重定位信息。

The decoding of unwind sections for machine type Advanced Micro Devices X86-64 is not currently supported.

No version information found in this file.