Linux-Lab10-Kernel Profiling

Kernel Profiling

实验目标：

熟悉 Linux 内核分析的基础知识
了解基本的分析工具
学习剖析方法和良好实践

本课程旨在将我们迄今为止在内核空间中所做的工作与现实世界的用例合并，在这些用例中，我们不编写内核空间代码，但我们使用分析工具查看内核，以便调试我们在编写常规的低级应用程序时遇到的问题

本课程的另一个重点是学习调试软件问题的一般方法，我们将介绍一些工具，这些工具让我们从内核中深入了解应用程序的运行方式

1
2
3

make clean
LABS=kernel_profiling make skels
make build

在处理 I/O 时，我们必须记住：与内存（速度快若干个数量级）和调度（负责决定 CPU 上当前运行的内容）相比，I/O 是操作系统中最慢的子系统之一

Investigating Reduced Responsiveness

插入 io.ko 模块后，系统的响应能力会下降：在命令行中键入命令时会出现明显的卡顿；但是运行 top 时，我们看到系统的负载并不高，也没有任何进程占用大量资源

了解 io.ko 模块正在做什么，以及为什么它会降低系统的响应能力

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/task.h>

/* Device numbers and module name; not referenced by the code visible in this excerpt. */
#define MY_MAJOR		42
#define MY_MINOR		0
#define MODULE_NAME		"deferred"

/* Timer type selectors; not referenced by the code visible in this excerpt. */
#define TIMER_TYPE_NONE		-1
#define TIMER_TYPE_SET		0
#define TIMER_TYPE_ALLOC	1
#define TIMER_TYPE_MON		2

/* Module metadata. */
MODULE_DESCRIPTION("Generate disruptive interrupts");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/*
 * The single timer used by this module: set up and armed in deferred_init(),
 * re-armed from timer_handler(), and cancelled in deferred_exit().
 */
struct timer_list timer;

/*
 * Timer callback: spins until HZ jiffies have elapsed, then re-arms the
 * module timer to fire again HZ jiffies from now.
 *
 * The busy-wait is kept as-is; given the module description ("Generate
 * disruptive interrupts") it appears to be the intended disruptive
 * behaviour rather than an accident.
 *
 * @tl: the timer that fired (unused; the global `timer` is re-armed).
 */
static void timer_handler(struct timer_list *tl)
{
	unsigned long deadline = jiffies + HZ;

	/*
	 * Wrap-safe "jiffies is before deadline" test. A plain
	 * `jiffies < deadline` misbehaves when the counter wraps around:
	 * if deadline lies just below ULONG_MAX and jiffies wraps past it,
	 * the plain comparison stays true long after the deadline passed.
	 * The signed difference gives the right answer across the wrap.
	 */
	while ((long)(jiffies - deadline) < 0) {
		(void)0;
	}
	mod_timer(&timer, jiffies + HZ);
}

/*
 * Module load entry point: logs a message, initialises the global timer
 * with timer_handler() as its callback and arms it to fire 5 * HZ jiffies
 * from now.
 *
 * Returns 0 unconditionally.
 */
static int deferred_init(void)
{
	pr_info("[deferred_init] Init module\n");

	timer_setup(&timer, timer_handler, 0);
	mod_timer(&timer, jiffies + 5 * HZ);

	return 0;
}

/*
 * Module unload entry point: logs a message and cancels the global timer
 * with del_timer_sync() before the module goes away.
 */
static void deferred_exit(void)
{
	pr_info("[deferred_exit] Exit module\n");

	del_timer_sync(&timer);
}

/* Register deferred_init()/deferred_exit() as the module's init and exit functions. */
module_init(deferred_init);
module_exit(deferred_exit);

加载内核模块 io.ko 后，程序 shell 有明显的卡顿
使用 top 命令：

Mem: 33064K used, 205848K free, 100K shrd, 292K buff, 4444K cached
CPU:   0% usr   0% sys   0% nic  99% idle   0% io   0% irq   0% sirq
Load average: 0.14 0.29 0.16 1/38 239
  PID  PPID USER     STAT   VSZ %VSZ %CPU COMMAND
  239   208 root     R     2972   1%   1% top
   10     2 root     IW       0   0%   0% [rcu_sched]
  208     1 root     S     2972   1%   0% -sh
  198     1 root     S     2828   1%   0% /sbin/syslogd -n -O /var/log/messages
  202     1 root     S     2828   1%   0% /sbin/klogd -n
  207     1 root     S     2828   1%   0% /sbin/getty 38400 tty1
  209     1 root     S     2828   1%   0% /sbin/getty 38400 tty2
  211     1 root     S     2828   1%   0% /sbin/getty 38400 tty4
  210     1 root     S     2828   1%   0% /sbin/getty 38400 tty3
  212     1 root     S     2828   1%   0% /sbin/getty 38400 tty5
  187     1 root     S     2828   1%   0% udhcpc -R -b -p /var/run/udhcpc.eth0.p
    1     0 root     S     2004   1%   0% init [5]
    9     2 root     SW       0   0%   0% [ksoftirqd/0]
   42     2 root     SWN      0   0%   0% [kmemleak]
   13     2 root     SW       0   0%   0% [kdevtmpfs]
   39     2 root     IW       0   0%   0% [kworker/0:2-eve]
    7     2 root     IW       0   0%   0% [kworker/u2:0-fl]
   34     2 root     IW<      0   0%   0% [kworker/0:1H-kb]
   38     2 root     IW       0   0%   0% [kworker/u2:1-fl]
    2     0 root     SW       0   0%   0% [kthreadd]

发现没有进程占用资源

Launching New Threads

执行 scheduling 二进制文件时，它会从多个并行运行的实例（下面的代码中为 300 个）打印消息，有两种形式：

创建线程
创建进程

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

/* Print the worker index `i` on its own line on standard output. */
void helper(int i) {
	fprintf(stdout, "%d\n", i);
}

/*
 * Thread entry point: prints the worker index carried in `arg` and
 * terminates the calling thread.
 *
 * @arg: the worker index smuggled through the pointer value (not a real
 *       address). It is converted back through intptr_t so the
 *       pointer-to-integer conversion is well defined and does not
 *       truncate silently where pointers are wider than int.
 */
void * thread_start(void *arg) {

	helper((int)(intptr_t) arg);
	pthread_exit(NULL);
}

/*
 * Launch NUM_INSTANCES workers that each print their index.
 *
 * Usage: ./scheduling <mode>
 *   mode == 0 : one pthread per worker
 *   otherwise : one forked child process per worker
 *
 * Timestamps are taken before and after the creation loop but are not
 * printed by this program.
 *
 * Note: the threads are never joined and the children are never waited
 * for. In thread mode, returning from main ends the process, so threads
 * that have not run yet may never print.
 *
 * Returns 0 on completion, -1 when the mode argument is missing.
 */
int main(int argc, char *argv[]) {

	enum { NUM_INSTANCES = 300 };

	pthread_t tid[NUM_INSTANCES];
	struct timeval begin, end;
	int use_threads;

	/* argv[1] is read below, so the mode argument is mandatory. */
	if (argc < 2) {
		printf("./scheduling <mode>\n");
		return -1;
	}

	/* The mode does not change between iterations; parse it once. */
	use_threads = (atoi(argv[1]) == 0);

	gettimeofday(&begin, NULL);

	for (int i = 0; i < NUM_INSTANCES; i++) {
		if (use_threads) {
			/* Pass the index by value inside the pointer, via intptr_t. */
			int rc = pthread_create(&tid[i], NULL, &thread_start,
						(void *)(intptr_t) i);
			if (rc != 0) {
				break;
			}
		} else {
			pid_t pid = fork();

			if (pid < 0) {
				/* Stop spawning instead of silently looping on failure. */
				perror("fork");
				break;
			}
			if (pid == 0) {
				/* Child: print and leave the loop so it does not fork again. */
				helper(i);
				break;
			}
		}
	}

	gettimeofday(&end, NULL);

	return 0;
}

进程：结果更加稳定，但速度慢
线程：速度更快，但结果不稳定

Tuning CP

我们的目标是编写一个 Linux 自带的 cp 工具的副本，该工具已由 memory 二进制文件实现，它提供了两种可用于复制操作的方法：

使用 read 系统调用读取内存中缓冲区中源文件的内容，并使用 write 系统调用将该缓冲区写入目标文件
使用 mmap 系统调用将源文件和目标文件映射到内存，并将源文件的内容复制到内存中的目标文件

对比两种方法的性能：

调查两种复制机制中的哪一种更快（对于此步骤，您将使用1024块大小）
找出更快的复制机制之后，请更改块大小参数，看看哪个值能带来最佳的复制性能

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/*
 * Copy one block of `len` bytes located at offset `off` from src_fd to
 * dst_fd through `buf`, retrying on short reads/writes.
 *
 * Returns 0 on success, -1 on I/O error or unexpected end of file.
 */
static int copy_block(int src_fd, int dst_fd, char *buf, size_t len, off_t off)
{
	size_t done;
	ssize_t n;

	for (done = 0; done < len; done += (size_t)n) {
		n = pread(src_fd, buf + done, len - done, off + (off_t)done);
		if (n <= 0)
			return -1;
	}

	for (done = 0; done < len; done += (size_t)n) {
		n = pwrite(dst_fd, buf + done, len - done, off + (off_t)done);
		if (n <= 0)
			return -1;
	}

	return 0;
}

/*
 * Copy <src> to <dst> in blocks of <blk_size> bytes.
 *
 * Usage: ./memory <mode> <blk_size> <src> <dst>
 *   mode == 0 : map both files with mmap() and memcpy() block by block
 *   otherwise : pread()/pwrite() block by block through a heap buffer
 *
 * Returns 0 on success and a negative value on any failure.
 */
int main(int argc, char *argv[]) {
	int src_fd, dst_fd = -1, mode;
	int ret = -1;
	struct stat st;
	unsigned long copied = 0, size, blk_size;
	char *src_p = MAP_FAILED, *dst_p = MAP_FAILED, *buf = NULL;

	/* argv[1]..argv[4] are all used below. */
	if (argc < 5) {
		printf("./memory <mode> <blk_size> <src> <dst>\n");
		return -1;
	}

	mode = atoi(argv[1]);
	blk_size = strtoul(argv[2], NULL, 10);
	if (blk_size == 0) {
		/* A zero block size would never make progress in the copy loop. */
		fprintf(stderr, "blk_size must be a positive integer\n");
		return -1;
	}

	printf("mode %d blk_size %lu src %s dst %s\n",
		mode, blk_size, argv[3], argv[4]);

	src_fd = open(argv[3], O_RDONLY);
	if (src_fd < 0)
		return src_fd;

	/* fstat() on the open descriptor describes the file actually being read. */
	if (fstat(src_fd, &st) < 0)
		goto out;
	size = st.st_size;

	/* mmap() rejects zero-length mappings, so an empty source is not mapped. */
	if (mode == 0 && size > 0) {
		src_p = mmap(NULL, size, PROT_READ, MAP_SHARED, src_fd, 0);
		if (src_p == MAP_FAILED)
			goto out;
	}

	dst_fd = open(argv[4], O_CREAT | O_RDWR | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
	if (dst_fd < 0)
		goto out;

	/* The destination must already have its final size before it is mapped. */
	if (ftruncate(dst_fd, size) < 0)
		goto out;

	if (mode == 0) {
		if (size > 0) {
			dst_p = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, dst_fd, 0);
			if (dst_p == MAP_FAILED)
				goto out;
		}
	} else {
		buf = malloc(blk_size);
		if (buf == NULL)
			goto out;
	}

	/* Copy block by block; the final block may be shorter than blk_size. */
	while (copied < size) {
		unsigned long chunk = size - copied;

		if (chunk > blk_size)
			chunk = blk_size;

		if (mode == 0) {
			memcpy(dst_p + copied, src_p + copied, chunk);
		} else if (copy_block(src_fd, dst_fd, buf, chunk, (off_t)copied) < 0) {
			goto out;
		}

		copied += chunk;
	}

	/* Flush the whole destination mapping, starting at its base address. */
	if (mode == 0 && size > 0 && msync(dst_p, size, MS_SYNC) < 0)
		goto out;

	ret = 0;

out:
	free(buf);
	if (dst_p != MAP_FAILED)
		munmap(dst_p, size);
	if (src_p != MAP_FAILED)
		munmap(src_p, size);
	if (dst_fd >= 0)
		close(dst_fd);
	close(src_fd);

	return ret;
}

结果：

root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 1 10240000 1 2         
mode 1 blk_size 10240000 src 1 dst 2                                            
root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 0 10240000 1 2         
mode 0 blk_size 10240000 src 1 dst 2

使用 read/write 有明显的延迟，使用 I/O，速度较慢
使用 mmap 几乎可以瞬间完成，使用映射，速度较快

I/O Latency

我们编写了一个读取磁盘内容的模块，插入 bio.ko 模块，我们看到系统负载出现较大峰值

Mem: 180032K used, 58880K free, 156K shrd, 324K buff, 15248K cached
CPU:   0% usr  46% sys   0% nic  51% idle   0% io   0% irq   1% sirq
Load average: 17.43 5.01 1.75 1/77 371
  PID  PPID USER     STAT   VSZ %VSZ %CPU COMMAND
   34     2 root     IW<      0   0%  24% [kworker/0:1H-kb]
    9     2 root     SW       0   0%   4% [ksoftirqd/0]
   10     2 root     IW       0   0%   2% [rcu_sched]
  371   208 root     R     2972   1%   1% top
  361     2 root     DW       0   0%   1% [mykwriterd10]
  364     2 root     DW       0   0%   1% [mykwriterd13]
  365     2 root     DW       0   0%   1% [mykwriterd14]
  366     2 root     DW       0   0%   1% [mykwriterd15]
  367     2 root     DW       0   0%   1% [mykwriterd16]
  368     2 root     DW       0   0%   1% [mykwriterd17]
  369     2 root     DW       0   0%   1% [mykwriterd18]
  370     2 root     DW       0   0%   1% [mykwriterd19]
  353     2 root     DW       0   0%   1% [mykwriterd2]
  354     2 root     DW       0   0%   1% [mykwriterd3]
  355     2 root     DW       0   0%   1% [mykwriterd4]
  356     2 root     DW       0   0%   1% [mykwriterd5]
  357     2 root     DW       0   0%   1% [mykwriterd6]
  358     2 root     DW       0   0%   1% [mykwriterd7]
  359     2 root     DW       0   0%   1% [mykwriterd8]
  360     2 root     DW       0   0%   1% [mykwriterd9]

Bad ELF

我们设法构建了一个ELF文件（作为 Unikraft 构建的一部分），该文件在进行静态分析时是有效的，但无法执行

➜  5-bad-elf git:(master) ✗ ./bad_elf
[1]    7357 segmentation fault  ./bad_elf

结果：

➜  5-bad-elf git:(master) ✗ readelf -a bad_elf
ELF 头：
  Magic：   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 
  类别:                              ELF64
  数据:                              2 补码，小端序 (little endian)
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI 版本:                          0
  类型:                              EXEC (可执行文件)
  系统架构:                          Advanced Micro Devices X86-64
  版本:                              0x1
  入口点地址：               0x400130
  程序头起点：          64 (bytes into file)
  Start of section headers:          60352 (bytes into file)
  标志：             0x0
  Size of this header:               64 (bytes)
  Size of program headers:           56 (bytes)
  Number of program headers:         3
  Size of section headers:           64 (bytes)
  Number of section headers:         10
  Section header string table index: 9

节头：
  [号] 名称              类型             地址              偏移量
       大小              全体大小          旗标   链接   信息   对齐
  [ 0]                   NULL             0000000000000000  00000000
       0000000000000000  0000000000000000           0     0     0
  [ 1] .text             PROGBITS         00000000004000f0  000000f0
       000000000000be11  0000000000000000 WAX       0     0     16
  [ 2] .uk_thread_initta NOBITS           000000000040bf01  0000bf01
       0000000000000007  0000000000000000  WA       0     0     1
  [ 3] .uk_inittab       PROGBITS         000000000040c000  0000bf20
       0000000000000008  0000000000000000   A       0     0     8
  [ 4] .rodata           PROGBITS         000000000040c020  0000bf40
       0000000000002b85  0000000000000000   A       0     0     32
  [ 5] .tbss             NOBITS           000000000040eba8  0000eac5
       0000000000000000  0000000000000000 WAT       0     0     1
  [ 6] .data             PROGBITS         000000000040ebb0  0000ead0
       0000000000000070  0000000000000000  WA       0     0     16
  [ 7] .bss              NOBITS           000000000040ec20  0000eb40
       00000000000003e8  0000000000000000  WA       0     0     32
  [ 8] .comment          PROGBITS         0000000000000000  0000eb40
       0000000000000029  0000000000000001  MS       0     0     1
  [ 9] .shstrtab         STRTAB           0000000000000000  0000eb69
       0000000000000052  0000000000000000           0     0     1
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
  L (link order), O (extra OS processing required), G (group), T (TLS),
  C (compressed), x (unknown), o (OS specific), E (exclude),
  l (large), p (processor specific)

There are no section groups in this file.

程序头：
  Type           Offset             VirtAddr           PhysAddr
                 FileSiz            MemSiz              Flags  Align
  LOAD           0x00000000000000f0 0x00000000004000f0 0x00000000004000f0
                 0x000000000000be11 0x000000000000be18  RWE    0x10
  LOAD           0x000000000000bf20 0x000000000040c000 0x000000000040c000
                 0x0000000000002c20 0x0000000000003008  RW     0x20
  GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
                 0x0000000000000000 0x0000000000000000  RWE    0x10

 Section to Segment mapping:
  段节...
   00     .text .uk_thread_inittab 
   01     .uk_inittab .rodata .data .bss 
   02     

There is no dynamic section in this file.

该文件中没有重定位信息。

The decoding of unwind sections for machine type Advanced Micro Devices X86-64 is not currently supported.

No version information found in this file.