0%

Kernel Profiling

实验目标:

  • 熟悉 Linux 内核分析的基础知识
  • 了解基本的分析工具
  • 学习剖析方法和良好实践

本课程旨在将我们迄今为止在内核空间中所做的工作与现实世界的用例合并,在这些用例中,我们不编写内核空间代码,但我们使用分析工具查看内核,以便调试我们在编写常规的低级应用程序时遇到的问题

本课程的另一个重点是学习调试软件问题的一般方法,我们将介绍一些工具,这些工具让我们从内核中深入了解应用程序的运行方式

1
2
3
make clean
LABS=kernel_profiling make skels
make build

在使用 I/O 时,我们必须记住,与内存(速度快一个数量级)和调度(处理CPU上当前运行的内容)相比,它是操作系统中最慢的系统之一

Investigating Reduced Responsiveness

插入 io.ko 模块后,系统的响应能力会下降:在命令行中键入命令时会出现断断续续的卡顿,但是运行 top 命令时,我们看到系统的负载并不高,也没有任何进程大量占用资源

了解 io.ko 模块正在做什么,以及为什么它会降低系统的响应能力

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/task.h>

/*
 * Device numbers and module name.
 * NOTE(review): none of these three macros is referenced in the visible code.
 */
#define MY_MAJOR 42
#define MY_MINOR 0
#define MODULE_NAME "deferred"

/*
 * Timer type selectors.
 * NOTE(review): not referenced in the visible code either.
 */
#define TIMER_TYPE_NONE -1
#define TIMER_TYPE_SET 0
#define TIMER_TYPE_ALLOC 1
#define TIMER_TYPE_MON 2

MODULE_DESCRIPTION("Generate disruptive interrupts");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* The single timer set up in deferred_init(), re-armed by timer_handler()
 * and deleted in deferred_exit().
 */
struct timer_list timer;

/*
 * Timer callback: spins for one second (HZ jiffies) and then re-arms the
 * global timer so that it fires again one second later.
 *
 * The busy-wait is deliberate: the module exists to be disruptive (see
 * MODULE_DESCRIPTION), so the handler burns CPU time instead of sleeping.
 *
 * @tl: the timer that expired (unused; the global timer is re-armed).
 */
static void timer_handler(struct timer_list *tl)
{
	unsigned long deadline = jiffies + HZ;

	/*
	 * time_before() compares jiffies values in a wraparound-safe way;
	 * a plain "jiffies < deadline" ends the spin immediately when
	 * jiffies + HZ overflows.
	 */
	while (time_before(jiffies, deadline))
		;

	mod_timer(&timer, jiffies + HZ);
}

/*
 * Module entry point: logs a message, binds timer_handler() to the global
 * timer and arms it to expire 5 * HZ jiffies from now.
 *
 * Return: always 0.
 */
static int deferred_init(void)
{
	pr_info("[deferred_init] Init module\n");

	timer_setup(&timer, timer_handler, 0);
	mod_timer(&timer, jiffies + 5 * HZ);

	return 0;
}

/*
 * Module exit point: logs a message and deletes the global timer with
 * del_timer_sync() before the module text goes away.
 */
static void deferred_exit(void)
{
	pr_info("[deferred_exit] Exit module\n" );

	del_timer_sync(&timer);
}

module_init(deferred_init);
module_exit(deferred_exit);
  • 加载内核模块 io.ko 后,程序 shell 有明显的卡顿
  • 使用 top 命令:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Mem: 33064K used, 205848K free, 100K shrd, 292K buff, 4444K cached
CPU: 0% usr 0% sys 0% nic 99% idle 0% io 0% irq 0% sirq
Load average: 0.14 0.29 0.16 1/38 239
PID PPID USER STAT VSZ %VSZ %CPU COMMAND
239 208 root R 2972 1% 1% top
10 2 root IW 0 0% 0% [rcu_sched]
208 1 root S 2972 1% 0% -sh
198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages
202 1 root S 2828 1% 0% /sbin/klogd -n
207 1 root S 2828 1% 0% /sbin/getty 38400 tty1
209 1 root S 2828 1% 0% /sbin/getty 38400 tty2
211 1 root S 2828 1% 0% /sbin/getty 38400 tty4
210 1 root S 2828 1% 0% /sbin/getty 38400 tty3
212 1 root S 2828 1% 0% /sbin/getty 38400 tty5
187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p
1 0 root S 2004 1% 0% init [5]
9 2 root SW 0 0% 0% [ksoftirqd/0]
42 2 root SWN 0 0% 0% [kmemleak]
13 2 root SW 0 0% 0% [kdevtmpfs]
39 2 root IW 0 0% 0% [kworker/0:2-eve]
7 2 root IW 0 0% 0% [kworker/u2:0-fl]
34 2 root IW< 0 0% 0% [kworker/0:1H-kb]
38 2 root IW 0 0% 0% [kworker/u2:1-fl]
2 0 root SW 0 0% 0% [kthreadd]
  • 发现没有进程占用资源

Launching New Threads

执行 scheduling 二进制文件时,它会从 100 个正在运行的实例并行打印消息(注意:下面给出的代码实际创建的是 300 个实例),创建实例有两种形式:

  • 创建线程
  • 创建进程
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

/* Write the worker index to standard output, followed by a newline. */
void helper(int i)
{
    fprintf(stdout, "%d\n", i);
}

/*
 * Thread entry point: prints the worker index smuggled through the
 * pthread argument pointer, then terminates the thread.
 *
 * @arg: the worker index stored directly in the pointer value (not a
 *       pointer to an int).
 */
void * thread_start(void *arg) {
    /* Convert through intptr_t: a direct pointer-to-int cast truncates
     * on targets where pointers are wider than int. */
    helper((int)(intptr_t) arg);
    pthread_exit(NULL);
}

/*
 * Start 300 workers that each print their index: as threads when <mode>
 * is 0, as forked child processes for any other value.
 *
 * Usage: ./scheduling <mode>
 *
 * Return: 0 normally, -1 when the <mode> argument is missing.
 */
int main(int argc, char *argv[]) {
    enum { NUM_WORKERS = 300 };
    int pid = 0;
    int mode;
    int started = 0;
    pthread_t tid[NUM_WORKERS];
    struct timeval begin, end;

    /* argv[1] is read below, so the program name alone is not enough. */
    if (argc < 2) {
        printf("./scheduling <mode>\n");
        return -1;
    }

    /* Parse the mode once instead of on every loop iteration. */
    mode = atoi(argv[1]);

    gettimeofday(&begin, NULL);

    for (int i = 0; i < NUM_WORKERS; i++) {
        if (mode == 0) {
            /* pthread_create() returns an error number, not a pid. */
            pid = pthread_create(&tid[i], NULL, &thread_start,
                                 (void *)(intptr_t) i);
            if (pid != 0) {
                break;
            }
            started++;
        } else {
            pid = fork();
            if (pid < 0) {
                perror("fork");
                break;
            }
            if (pid == 0) {
                /* Child: do the work and leave the loop so that it
                 * does not fork further children itself. */
                helper(i);
                break;
            }
        }
    }

    /* Returning from main() ends the whole process, so wait for every
     * thread that was started; otherwise some never get to print. */
    for (int i = 0; i < started; i++) {
        pthread_join(tid[i], NULL);
    }

    gettimeofday(&end, NULL);

    return 0;
}
  • 进程:结果更加稳定,但速度慢
  • 线程:速度更快,但结果不稳定

Tuning CP

我们的目标是编写一个 Linux 自带 cp 工具的仿制品,它由 memory 二进制文件实现,提供了两种可用于复制操作的方法:

  • 使用 read 系统调用读取内存中缓冲区中源文件的内容,并使用 write 系统调用将该缓冲区写入目标文件
  • 使用 mmap 系统调用将源文件和目标文件映射到内存,并将源文件的内容复制到内存中的目标文件

对比两种方法的性能:

  • 调查两种复制机制中的哪一种更快(对于此步骤,您将使用1024块大小)
  • 找到哪种复制机制更快,请更改块大小参数,看看哪个值能为您提供最佳副本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/*
 * Copy <src> to <dst> in chunks of <blk_size> bytes.
 *
 * Usage: ./memory <mode> <blk_size> <src> <dst>
 *   mode 0     - mmap both files and memcpy between the two mappings
 *   other mode - pread from the source and pwrite to the destination
 *
 * Return: 0 on success, -1 on a usage error or when a system call fails.
 */
int main(int argc, char *argv[]) {
    int src_fd = -1, dst_fd = -1, mode;
    int rc = -1;
    struct stat st;
    unsigned long size = 0, blk_size, off, len;
    char *src_p = MAP_FAILED, *dst_p = MAP_FAILED, *buf = NULL;

    /* argv[1]..argv[4] are all read below. */
    if (argc < 5) {
        printf("./memory <mode> <blk_size> <src> <dst>\n");
        return -1;
    }

    mode = atoi(argv[1]);
    blk_size = strtoul(argv[2], NULL, 10);
    if (blk_size == 0) {
        /* A zero block size would never make progress. */
        printf("blk_size must be a positive number\n");
        return -1;
    }

    printf("mode %d blk_size %lu src %s dst %s\n",
           mode, blk_size, argv[3], argv[4]);

    src_fd = open(argv[3], O_RDONLY);
    if (src_fd < 0) {
        perror("open src");
        goto out;
    }

    /* fstat() on the open descriptor avoids a second path lookup. */
    if (fstat(src_fd, &st) < 0) {
        perror("fstat");
        goto out;
    }
    size = st.st_size;

    dst_fd = open(argv[4], O_CREAT | O_RDWR | O_TRUNC,
                  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
    if (dst_fd < 0) {
        perror("open dst");
        goto out;
    }

    /* The destination must already have its final size before it is
     * mapped and written through the mapping. */
    if (ftruncate(dst_fd, size) < 0) {
        perror("ftruncate");
        goto out;
    }

    if (size == 0) {
        /* Nothing to copy, and mmap() rejects a zero length. */
        rc = 0;
        goto out;
    }

    if (mode == 0) {
        /* mmap() reports failure with MAP_FAILED, never with NULL or a
         * "negative" pointer. */
        src_p = mmap(NULL, size, PROT_READ, MAP_SHARED, src_fd, 0);
        if (src_p == MAP_FAILED) {
            perror("mmap src");
            goto out;
        }

        dst_p = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, dst_fd, 0);
        if (dst_p == MAP_FAILED) {
            perror("mmap dst");
            goto out;
        }

        for (off = 0; off < size; off += len) {
            len = size - off < blk_size ? size - off : blk_size;
            memcpy(dst_p + off, src_p + off, len);
        }

        /* Flush the whole mapping, starting at its base address. */
        if (msync(dst_p, size, MS_SYNC) < 0) {
            perror("msync");
            goto out;
        }
    } else {
        buf = malloc(blk_size);
        if (buf == NULL) {
            perror("malloc");
            goto out;
        }

        for (off = 0; off < size; off += len) {
            len = size - off < blk_size ? size - off : blk_size;

            /* The final chunk uses the same running offset as the
             * others and only the remaining length. */
            if (pread(src_fd, buf, len, off) != (ssize_t) len) {
                perror("pread");
                goto out;
            }
            if (pwrite(dst_fd, buf, len, off) != (ssize_t) len) {
                perror("pwrite");
                goto out;
            }
        }
    }

    rc = 0;

out:
    free(buf);
    if (dst_p != MAP_FAILED)
        munmap(dst_p, size);
    if (src_p != MAP_FAILED)
        munmap(src_p, size);
    if (dst_fd >= 0)
        close(dst_fd);
    if (src_fd >= 0)
        close(src_fd);

    return rc;
}
  • 结果:
1
2
3
4
root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 1 10240000 1 2         
mode 1 blk_size 10240000 src 1 dst 2
root@qemux86:~/skels/kernel_profiling/3-memory# ./memory 0 10240000 1 2
mode 0 blk_size 10240000 src 1 dst 2
  • 使用 read/write 有明显的延迟,使用 I/O,速度较慢
  • 使用 mmap 几乎可以瞬间完成,使用映射,速度较快

I/O Latency

我们编写了一个读取磁盘内容的模块,插入 bio.ko 模块,我们看到系统负载出现较大峰值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Mem: 180032K used, 58880K free, 156K shrd, 324K buff, 15248K cached
CPU: 0% usr 46% sys 0% nic 51% idle 0% io 0% irq 1% sirq
Load average: 17.43 5.01 1.75 1/77 371
PID PPID USER STAT VSZ %VSZ %CPU COMMAND
34 2 root IW< 0 0% 24% [kworker/0:1H-kb]
9 2 root SW 0 0% 4% [ksoftirqd/0]
10 2 root IW 0 0% 2% [rcu_sched]
371 208 root R 2972 1% 1% top
361 2 root DW 0 0% 1% [mykwriterd10]
364 2 root DW 0 0% 1% [mykwriterd13]
365 2 root DW 0 0% 1% [mykwriterd14]
366 2 root DW 0 0% 1% [mykwriterd15]
367 2 root DW 0 0% 1% [mykwriterd16]
368 2 root DW 0 0% 1% [mykwriterd17]
369 2 root DW 0 0% 1% [mykwriterd18]
370 2 root DW 0 0% 1% [mykwriterd19]
353 2 root DW 0 0% 1% [mykwriterd2]
354 2 root DW 0 0% 1% [mykwriterd3]
355 2 root DW 0 0% 1% [mykwriterd4]
356 2 root DW 0 0% 1% [mykwriterd5]
357 2 root DW 0 0% 1% [mykwriterd6]
358 2 root DW 0 0% 1% [mykwriterd7]
359 2 root DW 0 0% 1% [mykwriterd8]
360 2 root DW 0 0% 1% [mykwriterd9]

Bad ELF

我们设法构建了一个ELF文件(作为 Unikraft 构建的一部分),该文件在进行静态分析时是有效的,但无法执行

1
2
5-bad-elf git:(master) ✗ ./bad_elf 
[1] 7357 segmentation fault ./bad_elf
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
5-bad-elf git:(master) ✗ readelf -a bad_elf
ELF 头:
Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
类别: ELF64
数据: 2 补码,小端序 (little endian)
Version: 1 (current)
OS/ABI: UNIX - System V
ABI 版本: 0
类型: EXEC (可执行文件)
系统架构: Advanced Micro Devices X86-64
版本: 0x1
入口点地址: 0x400130
程序头起点: 64 (bytes into file)
Start of section headers: 60352 (bytes into file)
标志: 0x0
Size of this header: 64 (bytes)
Size of program headers: 56 (bytes)
Number of program headers: 3
Size of section headers: 64 (bytes)
Number of section headers: 10
Section header string table index: 9

节头:
[号] 名称 类型 地址 偏移量
大小 全体大小 旗标 链接 信息 对齐
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .text PROGBITS 00000000004000f0 000000f0
000000000000be11 0000000000000000 WAX 0 0 16
[ 2] .uk_thread_initta NOBITS 000000000040bf01 0000bf01
0000000000000007 0000000000000000 WA 0 0 1
[ 3] .uk_inittab PROGBITS 000000000040c000 0000bf20
0000000000000008 0000000000000000 A 0 0 8
[ 4] .rodata PROGBITS 000000000040c020 0000bf40
0000000000002b85 0000000000000000 A 0 0 32
[ 5] .tbss NOBITS 000000000040eba8 0000eac5
0000000000000000 0000000000000000 WAT 0 0 1
[ 6] .data PROGBITS 000000000040ebb0 0000ead0
0000000000000070 0000000000000000 WA 0 0 16
[ 7] .bss NOBITS 000000000040ec20 0000eb40
00000000000003e8 0000000000000000 WA 0 0 32
[ 8] .comment PROGBITS 0000000000000000 0000eb40
0000000000000029 0000000000000001 MS 0 0 1
[ 9] .shstrtab STRTAB 0000000000000000 0000eb69
0000000000000052 0000000000000000 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
L (link order), O (extra OS processing required), G (group), T (TLS),
C (compressed), x (unknown), o (OS specific), E (exclude),
l (large), p (processor specific)

There are no section groups in this file.

程序头:
Type Offset VirtAddr PhysAddr
FileSiz MemSiz Flags Align
LOAD 0x00000000000000f0 0x00000000004000f0 0x00000000004000f0
0x000000000000be11 0x000000000000be18 RWE 0x10
LOAD 0x000000000000bf20 0x000000000040c000 0x000000000040c000
0x0000000000002c20 0x0000000000003008 RW 0x20
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RWE 0x10

Section to Segment mapping:
段节...
00 .text .uk_thread_inittab
01 .uk_inittab .rodata .data .bss
02

There is no dynamic section in this file.

该文件中没有重定位信息。

The decoding of unwind sections for machine type Advanced Micro Devices X86-64 is not currently supported.

No version information found in this file.

Kernel Development on ARM

实验目标:

  • 了解片上系统 System on a Chip(SoC)的含义
  • 熟悉使用 ARM 作为受支持架构的嵌入式世界
  • 了解板级支持包(BSP,Board Support Package)的含义
  • 以 i.MX6UL 平台为例,使用 Qemu 编译和引导 ARM 内核
  • 使用设备树熟悉硬件描述

片上系统 System on a Chip 是将整个系统集成到其上的集成电路(IC)

通常在 SoC 上可以找到的组件包括中央处理单元(CPU)、内存、输入/输出端口、存储设备以及更复杂的模块,如音频数字接口、神经处理单元(NPU)或图形处理单元(GPU)

SoC 可用于各种最常见的应用,包括:

  • 消费类电子产品(电视机、手机、视频游戏机)
  • 工业计算机(医学成像等)
  • 汽车
  • 家电

SoC 首选 ARM 架构,其采用的指令集架构属于 RISC(精简指令集计算机,Reduced Instruction Set Computer)

ARM 平台的简化视图如下图所示:

  • 一或者多个 CPU
  • 一个系统总线
  • 时钟和复位模块(PLL,OSC,复位控制器)
  • 中断控制器
  • 定时器
  • 内存控制器
  • 外设控制器

以下是 i.MX6UL 平台的完整框图:

Board Support package

Board Support Package(BSP)是允许演示特定硬件平台功能的最小软件包集:

  • 工具链 Toolchain:为ARM平台生成可执行代码的交叉编译器
1
2
sudo apt-get install gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm32
sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu # for arm64
  • BootLoader 引导加载程序
  • Linux 内核映像、设备树文件和驱动程序
  • Rootfs 根文件系统

Device tree

设备树 DT 是用于描述系统中硬件设备的树结构,树中的每个节点描述一个设备,因此称为设备节点,引入 DT 是为了提供一种发现 “non-discoverable 不可发现” 的硬件的方法

设备树存储在设备树源(.dts)中,并编译为设备树 blob(.dtb)

  • 一个节点可能包含以该格式排列的多个属性,名称是一个字符串,值可以是字节,字符串,字符串数组
  • 下面是一个示例:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Skeleton showing the property value forms a node can carry. */
/ {
node@0 {
/* property with no value */
empty-property;
/* single string value */
string-property = "string value";
/* list of strings */
string-list-property = "string value 1", "string value 2";
/* list of cells; value1 and value2 are placeholders */
int-list-property = <value1 value2>;

child-node@0 {
child-empty-property;
child-string-property = "string value";
/* reference to the node labelled child-node1 below */
child-node-reference = <&child-node1>;
};

/* "child-node1:" is the label used by the reference above */
child-node1: child-node@1 {
child-empty-property;
child-string-property = "string value";
};
};
};

Rootfs

根文件系统是挂载在文件层次结构顶部的文件系统,它至少应包含允许系统引导到 shell 的关键文件

  • 我们将使用 Yocto 根图像,为了下载一个 rootfs 映像,需要运行:
1
2
$ cd tools/labs/
$ ARCH=arm make core-image-minimal-qemuarm.ext4

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=arm_kernel_development make skels
make build

ARM 内核编译:

1
2
3
$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make menuconfig
# set FSL_ASRC=n and DRM_MXSFB=n
$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8

内核模块编译:

1
2
3
4
5
6
# modules build
tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build
# modules copy
tools/labs $ ARCH=arm make copy
# kernel build
tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8

ARM 内核启动:

1
qemu-system-arm -M mcimx6ul-evk -cpu cortex-a7 -m 512M -kernel arch/arm/boot/zImage -nographic  -dtb arch/arm/boot/dts/imx6ul-14x14-evk.dtb -append "root=/dev/mmcblk0 rw console=ttymxc0 loglevel=8 earlycon printk" -sd tools/labs/core-image-minimal-qemuarm.ext4

Networking

实验目标:

  • 了解 Linux 内核网络体系结构
  • 使用数据包过滤器或防火墙,学习 IP 数据包管理技能
  • 熟悉如何在 Linux 内核级别使用套接字

互联网的发展导致网络应用的指数级增长,从而增加了操作系统网络子系统的速度和生产力要求

  • 网络子系统不是操作系统内核的基本组件(Linux 内核可以在没有网络支持的情况下编译)
  • 然而,计算系统(甚至是嵌入式设备)不太可能具有非网络操作系统
  • 现代操作系统使用 TCP/IP 堆栈,由内核实现传输层的协议,而应用层协议通常在用户空间(HTTP,FTP,SSH等)中实现

Networking in user space

在用户空间中,网络通信的抽象是套接字 socket

套接字 socket 抽象了一个通信通道,是基于内核的 TCP/IP 堆栈交互接口(其实 TCP/IP 的底层就是发包,通过发包实现计算机网络之间的交互,而 socket 让发包变得更简单了)

Networking in Linux kernel

Linux 内核为处理网络数据包提供了三种基本结构:

struct socket

  • 一个非常接近用户空间的抽象,即用于编程网络应用的 BSD 套接字
1
2
3
4
5
6
7
8
9
/* BSD socket: the abstraction closest to user space. */
struct socket {
socket_state state; /* state the socket is in */
short type; /* type of this socket */
unsigned long flags; /* flag bits */
struct socket_wq *wq; /* queue of processes waiting on this socket and async-notification queue */
struct file *file; /* associated file */
struct sock *sk; /* associated sock */
const struct proto_ops *ops; /* set of protocol-specific operations */
};
  • 相关联的 API 如下:
1
2
3
4
5
/* socket create/delete */
int sock_create(int family, int type, int protocol, struct socket **res); /* creates a struct socket after the socket() system call is invoked */
int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res); /* creates a kernel socket */
int sock_create_lite(int family, int type, int protocol, struct socket **res); /* creates a kernel socket without parameter sanity checks */
void sock_release(struct socket *sock); /* closes the socket and releases the associated resources */

struct sock

  • 在 Linux 术语中是套接字的网络表示形式,又称为 INET 套接字
  • 该结构用于存储有关连接状态的信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;

......

socket_lock_t sk_lock;
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
struct {
atomic_t rmem_alloc;
int len;
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc

int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sk_ll_usec;
/* ===== mostly read cache line ===== */
unsigned int sk_napi_id;
#endif
int sk_rcvbuf;

struct sk_filter __rcu *sk_filter;
union {
struct socket_wq __rcu *sk_wq;
struct socket_wq *sk_wq_raw;
};
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;

/* ===== cache line for TX ===== */
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
unsigned long sk_tsq_flags;
union {
struct sk_buff *sk_send_head; /* 是用于数据传输的sk_buff列表 */
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
__u32 sk_dst_pending_confirm;
u32 sk_pacing_status; /* see enum sk_pacing */
long sk_sndtimeo;
struct timer_list sk_timer;
__u32 sk_priority;
__u32 sk_mark;
unsigned long sk_pacing_rate; /* bytes per second */
unsigned long sk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
netdev_features_t sk_route_forced_caps;
int sk_gso_type;
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
__u32 sk_txhash;

/*
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
unsigned int __sk_flags_offset[0];

......

unsigned int sk_padding : 1,
sk_kern_sock : 1,
sk_no_check_tx : 1,
sk_no_check_rx : 1,
sk_userlocks : 4,
sk_protocol : 8, /* 套接字使用的协议类型 */
sk_type : 16; /* 套接字类型(SOCK_STREAM,SOCK_DGRAM等) */
#define SK_PROTOCOL_MAX U8_MAX
u16 sk_gso_max_segs;
u8 sk_pacing_shift;
unsigned long sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
ktime_t sk_stamp;
#if BITS_PER_LONG==32
seqlock_t sk_stamp_seq;
#endif
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
atomic_t sk_zckey;

u8 sk_clockid;
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;

struct socket *sk_socket; /* 容纳它的BSD socket */
void *sk_user_data;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
struct sock_cgroup_data sk_cgrp_data;
struct mem_cgroup *sk_memcg;
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
void (*sk_write_space)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
#ifdef CONFIG_SOCK_VALIDATE_XMIT
struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk,
struct net_device *dev,
struct sk_buff *skb);
#endif
void (*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb;
struct rcu_head sk_rcu;
};
  • 相关 API 如下:
1
2
3
4
5
/* sending/receiving messages */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); /* receive a message from a socket (kernel space) */
int sock_sendmsg(struct socket *sock, struct msghdr *msg); /* send a message through a socket (kernel space) */
int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags); /* receive a message from a socket (kernel space) */
int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size); /* send a message through a socket (kernel space) */

struct sk_buff

  • 用于描述一个网络数据包及其状态
  • 该结构在内核收到数据包时创建,数据包可能来自用户空间,也可能来自网络接口
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/* Describes one network packet and its state. */
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next; /* links sk_buffs into a list */
struct sk_buff *prev;

union {
struct net_device *dev; /* network device that sends or receives the buffer */
unsigned long dev_scratch;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
};

union {
struct sock *sk; /* sock associated with the buffer */
int ip_defrag_offset;
};

union {
ktime_t tstamp;
u64 skb_mstamp_ns; /* earliest departure time */
};
char cb[48] __aligned(8);
union {
struct {
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
};

#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
unsigned long _nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
struct nf_bridge_info *nf_bridge;
#endif
unsigned int len, data_len;
__u16 mac_len, hdr_len;
__u16 queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)

__u8 __cloned_offset[0];
__u8 cloned:1,
nohdr:1,
fclone:2,
peeked:1,
head_frag:1,
xmit_more:1,
pfmemalloc:1;
/* private: */
__u32 headers_start[0];
/* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)

__u8 __pkt_type_offset[0];
__u8 pkt_type:3;
__u8 ignore_df:1;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;

__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;

__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_not_inet:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ipvs_property:1;

__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1;
__u8 offload_mr_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
__u8 tc_redirected:1;
__u8 tc_from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
__u8 decrypted:1;
#endif

#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif

union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
union {
unsigned int napi_id;
unsigned int sender_cpu;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif

union {
__u32 mark;
__u32 reserved_tailroom;
};

union {
__be16 inner_protocol;
__u8 inner_ipproto;
};

__u16 inner_transport_header;
__u16 inner_network_header;
__u16 inner_mac_header;

__be16 protocol;
__u16 transport_header;
__u16 network_header;
__u16 mac_header;

/* private: */
__u32 headers_end[0];
/* public: */

/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
refcount_t users;
};

Conversions

在不同的系统中,有几种方法可以对单词中的字节进行排序(字节序),包括:

  • Big Endian 大端
  • Little Endian 小端

由于网络将系统与不同的平台互连,因此互联网已经为数字数据的存储强加了一个标准的顺序,被称为 network byte-order 网络字节顺序

  • 网络字节序就是 Big Endian 大端字节序
  • 对于转换大小端,我们使用以下宏:
1
2
3
4
u16 htons(u16 x) /* converts a 16-bit integer from host byte order to network byte order */
u32 htonl(u32 x) /* converts a 32-bit integer from host byte order to network byte order */
u16 ntohs(u16 x) /* converts a 16-bit integer from network byte order to host byte order */
u32 ntohl(u32 x) /* converts a 32-bit integer from network byte order to host byte order */

netfilter

网络过滤器 netfilter 是内核接口的名称,用于捕获网络数据包以修改/分析它们(用于过滤,NAT等)

用户空间通过 iptables 使用网络过滤器接口

在 Linux 内核中,使用网络过滤器的数据包捕获是通过附加钩子来完成的:

  • 可以根据需要,在内核网络数据包所经过路径的不同位置上指定钩子
  • 可以在此处找到流程图,其中标出了数据包所经过的路径以及可以放置钩子的位置

钩子 hook 是通过以下结构定义的:

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Signature of a netfilter hook handler: receives the private pointer,
 * the packet buffer and the hook state, and returns an unsigned int
 * (the handlers in these notes return NF_ACCEPT).
 */
typedef unsigned int nf_hookfn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state);

/* Describes one netfilter hook to be registered. */
struct nf_hook_ops {
nf_hookfn *hook; /* handler called when a network packet is captured */
struct net_device *dev; /* device (network interface) to capture on */
void *priv; /* private data handed to the handler as its priv argument */
u_int8_t pf; /* protocol family (PF_INET, etc.) */
unsigned int hooknum; /* hook number */
int priority; /* priority */
};
  • 钩子函数 hook 的签名中有一个 nf_hook_state 结构体,用于描述 hook 的状态信息,关键条目如下:
1
2
3
4
5
6
7
8
9
/* State information passed to a hook handler (key entries only). */
struct nf_hook_state {
unsigned int hook; /* hook number */
u_int8_t pf; /* protocol family */
struct net_device *in; /* input interface */
struct net_device *out; /* output interface */
struct sock *sk; /* corresponding sock (INET socket) */
struct net *net; /* corresponding net (kernel network namespace) */
int (*okfn)(struct net *, struct sock *, struct sk_buff *);
};

相关 API 如下:

1
2
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); /* registers a hook */
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); /* unregisters a hook */
1
2
3
4
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
unsigned int n); /* calls nf_register_net_hook n times */
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
unsigned int n); /* calls nf_unregister_net_hook n times */
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Register the first @n entries of @reg, in order.
 *
 * If any registration fails, the entries registered so far are
 * unregistered again and the failing call's error code is returned.
 *
 * Return: 0 when every hook was registered, otherwise the error code of
 * the first nf_register_net_hook() call that failed.
 */
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
			  unsigned int n)
{
	unsigned int done;

	for (done = 0; done < n; done++) {
		int rc = nf_register_net_hook(net, &reg[done]);

		if (!rc)
			continue;

		/* Roll back only if something was actually registered. */
		if (done)
			nf_unregister_net_hooks(net, reg, done);
		return rc;
	}

	return 0;
}
EXPORT_SYMBOL(nf_register_net_hooks);

/* Unregister the first @hookcount entries of @reg, front to back. */
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
			     unsigned int hookcount)
{
	unsigned int idx = 0;

	while (idx < hookcount) {
		nf_unregister_net_hook(net, &reg[idx]);
		idx++;
	}
}
EXPORT_SYMBOL(nf_unregister_net_hooks);

使用案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/tcp.h>

/*
 * Example hook handler: looks up the IP header and, depending on the
 * protocol, the TCP or UDP header of the packet, then lets it through.
 *
 * Return: always NF_ACCEPT.
 */
static unsigned int my_nf_hookfn(void *priv,
				 struct sk_buff *skb,
				 const struct nf_hook_state *state)
{
	struct iphdr *iph = ip_hdr(skb);	/* IP header */

	/* iph->saddr - source IP address */
	/* iph->daddr - destination IP address */
	switch (iph->protocol) {
	case IPPROTO_TCP:
		/* TCP packets are only inspected when the address matches. */
		if (test_daddr(iph->daddr)) {
			struct tcphdr *tcph = tcp_hdr(skb);	/* TCP header */
		}
		break;
	case IPPROTO_UDP: {
		struct udphdr *udph = udp_hdr(skb);	/* UDP header */
		break;
	}
	default:
		break;
	}

	return NF_ACCEPT;
}

/* Hook description: run my_nf_hookfn first on locally generated IPv4 output. */
static struct nf_hook_ops my_nfho = {
	.pf       = PF_INET,
	.hooknum  = NF_INET_LOCAL_OUT,
	.priority = NF_IP_PRI_FIRST,
	.hook     = my_nf_hookfn,
};

/*
 * Module init: registers my_nfho in the initial network namespace.
 *
 * Return: the result of nf_register_net_hook().
 */
int __init my_hook_init(void)
{
	int rc = nf_register_net_hook(&init_net, &my_nfho);

	return rc;
}

/* Module exit: unregisters the hook that my_hook_init() registered. */
void __exit my_hook_exit(void)
{
nf_unregister_net_hook(&init_net, &my_nfho);
}

module_init(my_hook_init);
module_exit(my_hook_exit);

netcat

在开发包含网络代码的应用程序时,最常用的工具之一是 netcat(也被称为“用于 TCP/IP 的瑞士军刀”),它允许:

  • 启动 TCP 连接
  • 等待 TCP 连接
  • 发送和接收 UDP 数据包
  • 以十六进制转储格式显示流量
  • 建立连接后运行程序(例如,shell)
  • 在已发送的包中设置特殊选项

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=networking make skels
make build

1.netfilter:

  • 编写一个内核模块,该模块显示启动出站连接的 TCP 数据包的源地址和端口
  • 可以通过 MY_IOCTL_FILTER_ADDRESS ioctl 调用来指定目标地址
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*
* SO2 - Networking Lab (#10)
*
* Exercise #1, #2: simple netfilter module
*
* Code skeleton.
*/

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <asm/atomic.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/tcp.h>

#include "filter.h"

MODULE_DESCRIPTION("Simple netfilter module");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define LOG_LEVEL KERN_ALERT
#define MY_DEVICE "filter"

static struct cdev my_cdev;
static atomic_t ioctl_set;
static unsigned int ioctl_set_addr;


/*
 * Decide whether a packet with destination @dst_addr should be reported.
 *
 * While no filter address has been set through the ioctl, every
 * destination is accepted. Once one is set, only an exact match with
 * ioctl_set_addr is accepted.
 *
 * Return: non-zero when the packet should be reported, 0 otherwise.
 */
static int test_daddr(unsigned int dst_addr)
{
	/* No filter configured yet: let everything through. */
	if (atomic_read(&ioctl_set) != 1)
		return 1;

	return ioctl_set_addr == dst_addr;
}

/* TODO 1: netfilter hook function */
static unsigned int my_nf_hookfn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct iphdr *iph = ip_hdr(skb);

if (iph->protocol == IPPROTO_TCP && test_daddr(iph->daddr)) {
struct tcphdr *tcph = tcp_hdr(skb);
if (tcph->syn && !tcph->ack)
printk(LOG_LEVEL "IP address is %pI4:%u\n", &iph->saddr,ntohs(tcph->source));
}

return NF_ACCEPT;
}

/* open() handler for the filter device: nothing to set up, always succeeds. */
static int my_open(struct inode *inode, struct file *file)
{
return 0;
}

/* release() handler for the filter device: nothing to tear down, always succeeds. */
static int my_close(struct inode *inode, struct file *file)
{
return 0;
}

/*
 * ioctl handler for the filter device.
 *
 * MY_IOCTL_FILTER_ADDRESS copies an address from the user pointer in
 * @arg into ioctl_set_addr and then marks the filter as set.
 *
 * Return: 0 on success, -EFAULT when the user buffer cannot be copied,
 * -ENOTTY for any other command.
 */
static long my_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	if (cmd != MY_IOCTL_FILTER_ADDRESS)
		return -ENOTTY;

	if (copy_from_user(&ioctl_set_addr, (void *)arg, sizeof(ioctl_set_addr)))
		return -EFAULT;

	/* Only flag the filter as active once the address is stored. */
	atomic_set(&ioctl_set, 1);

	return 0;
}

/* File operations of the filter character device. */
static const struct file_operations my_fops = {
	.owner          = THIS_MODULE,
	.unlocked_ioctl = my_ioctl,
	.open           = my_open,
	.release        = my_close,
};

/* Hook description: run my_nf_hookfn first on locally generated IPv4 output. */
static struct nf_hook_ops my_nfho = {
	.pf       = PF_INET,
	.hooknum  = NF_INET_LOCAL_OUT,
	.priority = NF_IP_PRI_FIRST,
	.hook     = my_nf_hookfn,
};

/*
 * Module init: reserves the character device number, resets the filter
 * state, adds the character device and registers the netfilter hook.
 *
 * Each step that fails undoes the steps completed before it.
 *
 * Return: 0 on success, otherwise the error code of the failing step.
 */
int __init my_hook_init(void)
{
	int err;

	/* register filter device */
	err = register_chrdev_region(MKDEV(MY_MAJOR, 0), 1, MY_DEVICE);
	if (err != 0)
		return err;

	atomic_set(&ioctl_set, 0);
	ioctl_set_addr = 0;

	/* init & add device */
	cdev_init(&my_cdev, &my_fops);
	err = cdev_add(&my_cdev, MKDEV(MY_MAJOR, 0), 1);
	if (err)
		goto out_unregister;

	/* register netfilter hook */
	err = nf_register_net_hook(&init_net, &my_nfho);
	if (err)
		goto out_cdev;

	return 0;

out_cdev:
	cdev_del(&my_cdev);
out_unregister:
	unregister_chrdev_region(MKDEV(MY_MAJOR, 0), 1);

	return err;
}

/*
 * Module exit: unregisters the netfilter hook first, then removes the
 * character device and releases its device number.
 */
void __exit my_hook_exit(void)
{
/* TODO 1: unregister hook */
nf_unregister_net_hook(&init_net,&my_nfho);
/* cleanup device */
cdev_del(&my_cdev);
unregister_chrdev_region(MKDEV(MY_MAJOR, 0), 1);
}

module_init(my_hook_init);
module_exit(my_hook_exit);
  • 结果:
1
2
3
4
5
root@qemux86:~/skels/networking/1-2-netfilter/user# ./test-1.sh                 
filter: loading out-of-tree module taints kernel.
IP address is 127.0.0.1:45934
Should show up in filter.
Check dmesg output.
1
2
3
4
5
root@qemux86:~/skels/networking/1-2-netfilter/user# ./test-2.sh                 
IP address is 127.0.0.1:45936
Should show up in filter.
Should NOT show up in filter.
Check dmesg output.
  • 使用 nc 命令时,hook 函数 my_nf_hookfn 被执行了
  • my_ioctl 的 MY_IOCTL_FILTER_ADDRESS 命令执行以后,地址校验开始生效;由于 ioctl_set_addr != iph->daddr,hook 函数虽然仍被调用,但 test_daddr 返回 0,因此没有打印任何信息
  • PS:当时眼瞎把 nf_hook_ops 初始化错了,导致后面一直报错,调试了很久

2.tcp-sock:

  • 编写一个内核模块,在模块初始化函数(init_module)中创建一个 TCP 套接字,并在环回接口的 60000 端口上侦听连接
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
* SO2 - Networking Lab (#10)
*
* Exercise #3, #4: simple kernel TCP socket
*
* Code skeleton.
*/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <net/sock.h>

MODULE_DESCRIPTION("Simple kernel TCP socket");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define LOG_LEVEL KERN_ALERT
#define MY_TCP_PORT 60000
#define LISTEN_BACKLOG 5

#define ON 1
#define OFF 0
#define DEBUG ON

/*
 * LOG(s): prints the string literal s plus a newline at KERN_DEBUG level
 * when DEBUG is ON, and expands to an empty statement otherwise.
 * s is pasted next to other literals, so it must itself be a literal.
 */
#if DEBUG == ON
#define LOG(s) \
do { \
printk(KERN_DEBUG s "\n"); \
} while (0)
#else
#define LOG(s) \
do {} while (0)
#endif

/*
 * print_sock_address(addr): logs the IPv4 address and port held in a
 * struct sockaddr_in at LOG_LEVEL.
 *
 * addr is parenthesised in the expansion so that any expression yielding
 * a struct sockaddr_in (e.g. *ptr) can be passed, not only a plain name.
 */
#define print_sock_address(addr) \
	do { \
		printk(LOG_LEVEL "connection established to " \
			"%pI4:%d\n", \
			&(addr).sin_addr.s_addr, \
			ntohs((addr).sin_port)); \
	} while (0)

static struct socket *sock; /* listening (server) socket */
static struct socket *new_sock; /* communication socket */

/*
 * Module init: creates a TCP socket bound to the loopback address on
 * port MY_TCP_PORT, listens on it, accepts one connection and prints
 * the address of the peer.
 *
 * On success both sockets stay open; my_tcp_sock_exit() releases them.
 * On failure every socket created so far is released before returning.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int __init my_tcp_sock_init(void)
{
	int err;
	/* address to bind on */
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(MY_TCP_PORT),
		.sin_addr = { htonl(INADDR_LOOPBACK) }
	};
	/* address of peer */
	struct sockaddr_in raddr;

	/* create listening socket */
	err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		printk("socket create wrong");
		goto out;
	}

	/* bind socket to loopback on port MY_TCP_PORT */
	err = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr));
	if (err < 0) {
		printk("bind wrong!");
		goto out_release;
	}

	/* start listening */
	err = sock->ops->listen(sock, LISTEN_BACKLOG);
	if (err < 0) {
		printk("listen wrong!");
		goto out_release;
	}

	/* create new socket for the accepted connection */
	err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &new_sock);
	if (err < 0) {
		printk("create socket");
		/* the listening socket already exists and must be released */
		goto out_release;
	}
	new_sock->ops = sock->ops;

	/* accept a connection */
	err = sock->ops->accept(sock, new_sock, 0, true);
	if (err < 0) {
		printk("accept wrong");
		goto out_release_new_sock;
	}

	/* get the address of the peer and print it */
	err = sock->ops->getname(new_sock, (struct sockaddr *) &raddr, 1);
	if (err < 0) {
		printk("not find name");
		goto out_release_new_sock;
	}
	print_sock_address(raddr);

	return 0;

out_release_new_sock:
	/* cleanup socket for accepted connection */
	sock_release(new_sock);
out_release:
	/* cleanup listening socket */
	sock_release(sock);
out:
	return err;
}

/*
 * my_tcp_sock_exit() - module exit: release both sockets of this module.
 */
void __exit my_tcp_sock_exit(void)
{
	/* communication socket obtained for the accepted connection */
	sock_release(new_sock);

	/* listening (server) socket */
	sock_release(sock);
}

module_init(my_tcp_sock_init);
module_exit(my_tcp_sock_exit);
  • PS:这里的 sock->ops->bind sock->ops->listen sock->ops->accept 就是用户态同名函数的底层实现
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
root@qemux86:~/skels/networking/3-4-tcp-sock# ./test-4.sh                       
+ sleep 1
+ insmod tcp_sock.ko
+ netstat -tuan
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State
tcp 0 0 127.0.0.1:60000 0.0.0.0:* LISTEN
+ echo+ sleep 3
+ ../netcat -q 4 127.0.0.1 60000 Should connect.
-p 60001
accept wrong
connection established to 127.0.0.1:60001
+ rmmod tcp_sock
  • 成功监听了目标端口

3.udp-sock:

  • 编写一个内核模块,用于创建 UDP 套接字,并将消息从套接字上的 MY_TEST_MESSAGE 宏发送到端口 60001 上的环回地址
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
* SO2 - Networking Lab (#10)
*
* Bonus: simple kernel UDP socket
*
* Code skeleton.
*/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/net.h>
#include <linux/in.h>
#include <net/sock.h>

MODULE_DESCRIPTION("Simple kernel UDP socket");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define LOG_LEVEL KERN_ALERT
#define MY_UDP_LOCAL_PORT 60000
#define MY_UDP_REMOTE_PORT 60001
#define MY_TEST_MESSAGE "kernelsocket\n"

#define ON 1
#define OFF 0
#define DEBUG ON

#if DEBUG == ON
#define LOG(s) \
do { \
printk(KERN_DEBUG s "\n"); \
} while (0)
#else
#define LOG(s) \
do {} while (0)
#endif

/*
 * print_sock_address - log the IPv4 address and port stored in @addr.
 * @addr: a struct sockaddr_in object (not a pointer to one)
 *
 * Formats the address with the %pI4 printk specifier, which takes a pointer
 * to the 32-bit address, instead of the NIPQUAD()/NIPQUAD_FMT helpers.
 */
#define print_sock_address(addr) \
	do { \
		printk(LOG_LEVEL "connection established to " \
			"%pI4:%d\n", \
			&(addr).sin_addr.s_addr, \
			ntohs((addr).sin_port)); \
	} while (0)

static struct socket *sock; /* UDP server */

/* send datagram */
static int my_udp_msgsend(struct socket *s)
{
/* address to send to */
struct sockaddr_in raddr = {
.sin_family = AF_INET,
.sin_port = htons(MY_UDP_REMOTE_PORT),
.sin_addr = { htonl(INADDR_LOOPBACK) }
};
int raddrlen = sizeof(raddr);
/* message */
struct msghdr msg;
struct iovec iov;
char *buffer = MY_TEST_MESSAGE;
int len = strlen(buffer) + 1;

/* TODO 1: build message */
iov.iov_base = buffer;
iov.iov_len = len;
msg.msg_flags = 0;
msg.msg_name = &raddr;
msg.msg_namelen = raddrlen;
msg.msg_control = NULL;
msg.msg_controllen = 0;

/* TODO 1: send the message down the socket and return the
* error code.
*/
return kernel_sendmsg(s, &msg, (struct kvec *) &iov, 1, len);

return 0;
}

/*
 * my_udp_sock_init() - module init: create, bind and use the UDP socket.
 *
 * Creates a kernel UDP socket, binds it to the loopback address on
 * MY_UDP_LOCAL_PORT and sends the test message through my_udp_msgsend().
 *
 * Return: 0 on success, otherwise the negative error code of the failing
 * step; the socket is released if it had already been created.
 */
int __init my_udp_sock_init(void)
{
	/* local address the socket is bound to */
	struct sockaddr_in addr = {
		.sin_family	= AF_INET,
		.sin_port	= htons(MY_UDP_LOCAL_PORT),
		.sin_addr	= { htonl(INADDR_LOOPBACK) }
	};
	int rc;

	/* step 1: the socket itself; nothing to undo if this fails */
	rc = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (rc < 0) {
		printk(LOG_LEVEL "can't create socket\n");
		return rc;
	}

	/* step 2: bind to loopback:MY_UDP_LOCAL_PORT */
	rc = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr));
	if (rc < 0) {
		printk(LOG_LEVEL "can't bind socket\n");
		goto fail_release;
	}

	/* step 3: send the datagram */
	rc = my_udp_msgsend(sock);
	if (rc < 0) {
		printk(LOG_LEVEL "can't send message\n");
		goto fail_release;
	}

	return 0;

fail_release:
	sock_release(sock);
	return rc;
}

/*
 * my_udp_sock_exit() - module exit: release the UDP socket of this module.
 */
void __exit my_udp_sock_exit(void)
{
	sock_release(sock);
}

module_init(my_udp_sock_init);
module_exit(my_udp_sock_exit);
  • 本实验算一个比较套路的过程,把 msghdr 和 iovec 填充好就行了
  • 结果:
1
2
3
4
5
6
7
8
9
root@qemux86:~/skels/networking/5-udp-sock# ./test-5.sh                         
+ pid=241
+ sleep 1
+ ../netcat -l -u -p 60001
+ insmod udp_sock.ko
udp_sock: loading out-of-tree module taints kernel.
kernelsocket
+ rmmod udp_sock
+ kill 241
  • 感觉本实验的侧重点是对协议栈 API 的使用
  • 之后有时间去专门分析一下这些 API 的底层,了解一下协议堆栈具体的处理过程

File system drivers

实验室目标:

  • 获取有关Linux中虚拟文件系统(VFS)的知识,并了解有关“inode”,“dentry”,“file”,超级块和数据块的概念
  • 了解在 VFS 中挂载文件系统的过程
  • 了解各种文件系统类型,并了解具有物理支持的文件系统(在磁盘上)和没有物理支持的文件系统之间的差异

Virtual File System (VFS)

虚拟文件系统(也称为 VFS)是内核的一个组件,用于处理与文件和文件系统相关的所有系统调用

  • VFS 是用户和特定文件系统之间的通用接口
  • VFS 的抽象简化了文件系统的实现,并提供了多个文件系统的集成
  • 文件系统的实现就通过使用 VFS 提供的 API 来完成,通用硬件和 I/O 子系统通信部分由 VFS 处理

从功能的角度来看,文件系统可以分为:

  • 磁盘文件系统(ext3, ext4, xfs, fat, ntfs …… )
  • 网络文件系统(nfs, smbfs/cifs, ncp …… )
  • 虚拟文件系统(procfs, sysfs, sockfs, pipefs …… )

Linux 内核将 VFS 用于目录和文件的层次结构(树),使用挂载操作将新的文件系统添加为 VFS 子树

VFS 可以使用普通文件作为虚拟块设备,因此可以在普通文件上挂载磁盘文件系统,这样,可以创建文件系统堆栈

VFS 的基本思想是提供一个可以表示来自任何文件系统的文件的单个文件模型,文件系统驱动程序负责引入公分母,这样,内核就可以创建包含整个系统的单个目录结构,将有一个文件系统将成为根,其余的将挂载在其各个目录中

The general file system model

通用文件系统模型(任何实现的文件系统都需要简化为通用文件系统模型)由几个明确定义的实体组成:

  • 超级块 superblock
    • 超级块存储已挂载文件系统所需的信息:
      • inode 和数据块的位置
      • 文件系统块大小
      • 最大文件名长度
      • 最大文件大小
      • 根 inode 的位置
    • 对于磁盘文件系统,超级块在磁盘的第一个块中有一个对应项(文件系统控制块)
  • 索引结点 inode
    • 保留有关一般意义上的文件的信息:常规文件,目录,特殊文件 (pipe,fifo),块设备,字符设备,链接,或任何可以抽象为文件的内容
    • 一个索引结点存储信息:
      • 文件类型
      • 文件大小
      • 访问权限
      • 访问或修改时间
      • 磁盘上数据的位置(指向包含数据的磁盘块的指针)
    • 像超级块一样,每个 inodes 都有一个磁盘对应物,磁盘上的索引节点通常被分组到一个专门的区域(inode 区域,与数据块区域分开),在某些文件系统中,inode 的等效项分布在文件系统结构(FAT)中
  • 文件 file
    • 文件是最接近用户的文件系统模型的组件,该结构仅作为 VFS 实体存在于内存中,并且在磁盘上没有物理对应项
    • 文件对象表示进程已打开的文件,维护以下信息:
      • 文件光标位置
      • 文件打开权限
      • 指向关联 inode 的指针(最终为其索引)
  • 目录项 dentry
    • 目录项(dentry,即目录条目)将索引节点与文件名相关联
    • 通常,dentry 结构包含两个字段:
      • 标识 inode 的整数
      • 表示其名称的字符串

这些实体是文件系统元数据(它们包含有关数据或其他元数据的信息),其中需要注意的就是 inode 和 file

从文件系统的角度来看,inode 表示文件:

  • inode 的属性是与文件关联的大小,权限,时间
  • inode 唯一标识文件系统中的文件

从用户的角度来看,file 表示文件:

  • file 的属性是 inode,文件名,文件打开属性,文件位置
  • 所有打开的文件都有与之关联的 file 结构体

Register and unregister filesystems

在单个系统上,不太可能有超过 5-6 个文件系统

因此,文件系统(或者更准确地说,文件系统类型)作为模块实现,并且可以随时加载或卸载

  • 描述特定文件系统的结构是 file_system_type
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
struct file_system_type {
const char *name;
int fs_flags;
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;
struct hlist_head fs_supers;

struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};

为了能够动态加载/卸载文件系统模块,需要文件系统注册/注销的 API

将文件系统注册到内核中的操作,通常在模块初始化函数中执行,为了注册文件系统,需要完成如下的工作:

  • 填充 file_system_type 结构体(至少填充:name、mount、kill_sb、fs_flags)
  • 调用 register_filesystem 函数

卸载模块时,必须通过调用函数 unregister_filesystem 来注销文件系统

注册文件系统的案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
static struct file_system_type ramfs_fs_type = {
.name = "ramfs",
.mount = ramfs_mount,
.kill_sb = ramfs_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};

/*
 * init_ramfs_fs() - register the ramfs filesystem type, at most once.
 *
 * Bit 0 of "once" is set atomically; if it was already set the function
 * returns 0 without registering again.
 *
 * Return: 0 if already done, otherwise the result of register_filesystem().
 */
static int __init init_ramfs_fs(void)
{
	int already_done = test_and_set_bit(0, &once);

	return already_done ? 0 : register_filesystem(&ramfs_fs_type);
}

挂载文件系统时,内核会调用 file_system_type->mount,该函数会进行一组初始化并返回表示挂载点目录的 dentry 结构,最简单的做法是在 mount 中使用如下 API:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int)); /* 挂载存储在块设备上的文件系统 */

struct dentry *mount_single(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int)); /* 挂载在所有挂载操作之间共享的文件系统 */

struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int)); /* 挂载不在物理设备上的文件系统 */

struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
const struct super_operations *ops,
const struct dentry_operations *dops, unsigned long magic); /* 伪文件系统的帮助器函数(例如:sockfs,pipe,通常是无法挂载的文件系统的) */
  • 这些函数会获取一个指针,该指针指向将在超级块初始化后调用的函数,以完成驱动程序的初始化

卸载文件系统时,内核调用 kill_sb,它将会执行清理操作并调用以下 API 中的一个:

1
2
3
void kill_block_super(struct super_block *sb); /* 卸载块设备上的文件系统 */
void kill_anon_super(struct super_block *sb); /* 卸载虚拟文件系统 */
void kill_litter_super(struct super_block *sb); /* 卸载不在物理设备上的文件系统 */

Superblock in VFS

超级块既作为物理实体存在(磁盘上的实体),也作为 VFS 实体存在(结构体 super_block),超级块仅包含信息,用于从磁盘写入和读取元数据

超级块操作由以下结构描述:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/* Table of callbacks a filesystem provides for operating on its superblock. */
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb); /* allocate the resources associated with an inode */
void (*destroy_inode)(struct inode *); /* destroy the resources associated with an inode */
void (*free_inode)(struct inode *);

void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc); /* write out the resources associated with an inode */
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *); /* called when the superblock is released at unmount time */
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *); /* called when a statfs system call is made */
int (*remount_fs) (struct super_block *, int *, char *); /* called when the kernel detects a remount attempt */
void (*umount_begin) (struct super_block *);

int (*show_options)(struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
long (*nr_cached_objects)(struct super_block *,
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
};

有一些重要的 API 可以使用 super_operations

1
2
3
4
5
struct buffer_head *__bread(struct block_device *bdev, sector_t block, unsigned size); /* 读取结构block_device中具有给定块号block和给定大小size的块buffer_head,如果成功,则返回指向buffer_head结构的指针,否则返回NULL */
struct buffer_head *sb_bread(struct super_block *sb, sector_t block); /* 与上一个函数执行的操作相同,但读取块的大小取自超级块以及从中完成读取的设备 */
void mark_buffer_dirty(struct buffer_head *bh); /* 将缓冲区标记为脏Dirty */
void brelse(struct buffer_head *bh); /* 释放缓冲区使用的内存 */
void map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block); /* 将缓冲头与相应的扇区相关联 */

填充超级块的一个案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#include <linux/pagemap.h>

#define RAMFS_MAGIC 0x858458f6

static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = ramfs_show_options,
};

/*
 * ramfs_fill_super() - fill in a freshly allocated superblock for ramfs.
 * @sb:     superblock to initialise
 * @data:   mount options, passed to save_mount_options() and
 *          ramfs_parse_options()
 * @silent: not used in this function
 *
 * Allocates the per-filesystem info, parses the mount options, sets the
 * generic superblock fields and creates the root directory inode/dentry.
 *
 * Return: 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by ramfs_parse_options().
 */
static int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ramfs_fs_info *fsi;
struct inode *inode;
int err;

save_mount_options(sb, data);

fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
/* NOTE(review): stored before the NULL check and never freed on the error
 * paths below; presumably the kill_sb callback releases sb->s_fs_info, confirm */
sb->s_fs_info = fsi;
if (!fsi)
return -ENOMEM;

err = ramfs_parse_options(data, &fsi->mount_opts);
if (err)
return err;

/* generic superblock parameters: one block is one page */
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;

/* root directory; its mode comes from the parsed mount options */
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
/* NOTE(review): inode is not checked here; presumably d_make_root() copes
 * with a NULL inode by returning NULL, confirm */
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;

return 0;
}
  • 内核提供了通用函数来实现文件系统结构的操作
  • 例如,上述代码中使用的 generic_delete_inode 和 simple_statfs(一般都以 generic 或者 simple 开头)

Buffer cache

缓冲区缓存是一个内核子系统,用于处理来自块设备的缓存(读取和写入)块,缓冲区缓存使用的基本实体是 buffer_head 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/* Basic entity used by the buffer cache: describes one cached block of a block device. */
struct buffer_head {
unsigned long b_state; /* state of the buffer */
struct buffer_head *b_this_page;/* circular list of page's buffers */
struct page *b_page; /* the page this bh is mapped to */

sector_t b_blocknr; /* number of the block on the device that was loaded or has to be saved to disk */
size_t b_size; /* size of the buffer */
char *b_data; /* pointer to the memory area that is read/written (the buffer body) */

struct block_device *b_bdev; /* the block device */
bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to
* serialise IO completion of other
* buffers in the page */
};

函数和有用的宏:

1
2
3
4
5
6
7
8
9
10
11
12
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); /* 查找内存区域中的第一个零位(size参数表示搜索区域中的位数) */
int test_and_set_bit(int nr, unsigned long *addr); /* 设置一位并获取旧值 */
int test_and_clear_bit(int nr, unsigned long *addr); /* 删除一位并获取旧值 */
int test_and_change_bit(unsigned int nr, volatile unsigned long *p); /* 反转一位并获取旧值 */

/*
 * File-type predicates for an inode mode.
 *
 * The file type is a multi-bit code stored in the S_IFMT field and the
 * codes share bits (S_IFBLK, for instance, contains the bits of both S_IFDIR
 * and S_IFCHR). The mode must therefore be masked with S_IFMT and compared
 * with the full type code; testing "(mode & S_IFxxx) == S_IFxxx" reports
 * a block device as a directory and a socket as a regular file.
 */
#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) // inode is a directory
#define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR) // inode is a character device
#define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK) // inode is a block device
#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) // inode is a regular file
#define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO) // inode is a FIFO
#define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK) // inode is a symbolic link
#define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK) // inode is a socket

The Inode Structure

索引节点 inode 是 UNIX 文件系统的重要组成部分,同时也是 VFS 的重要组成部分

索引节点是元数据(它具有有关信息的信息):

  • 索引节点唯一标识磁盘上的文件并保存有关该文件的信息(uid、gid、访问权限、访问时间、指向数据块的指针等)
  • 索引节点是指磁盘上的文件,一个 inode 可以关联任意数量的 file 结构(多个进程可以打开同一个文件,或者一个进程可以多次打开同一个文件)
  • 与 VFS 中的其他结构一样,它是一种通用结构,它涵盖了所有受支持的文件系统类型的选项,甚至包括那些在磁盘上没有对应 inode 实体的文件系统(如 FAT)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
 * In-memory (VFS) representation of an inode. Only the fields annotated
 * below are discussed in the surrounding text.
 */
struct inode {
umode_t i_mode; /* file type and access rights */
unsigned short i_opflags;
kuid_t i_uid; /* owner user id */
kgid_t i_gid; /* owner group id */
unsigned int i_flags;

#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif

const struct inode_operations *i_op; /* set of callbacks that operate on the inode */
struct super_block *i_sb; /* superblock of the filesystem this inode belongs to */
struct address_space *i_mapping;

#ifdef CONFIG_SECURITY
void *i_security;
#endif

/* Stat data, not accessed from path walking */
unsigned long i_ino; /* inode number (uniquely identifies the inode within the filesystem) */
union {
const unsigned int i_nlink; /* link count; for filesystems without links (hard or symbolic) it is always set to 1 */
unsigned int __i_nlink;
};
dev_t i_rdev; /* device number; NOTE(review): the original note read "device this filesystem is mounted on", but for device special files this is presumably the device the inode represents, confirm */
loff_t i_size; /* size in bytes */
struct timespec64 i_atime; /* access time: last time this inode was accessed */
struct timespec64 i_mtime; /* modify time: last time this inode was modified */
struct timespec64 i_ctime; /* change time: last time the status of this inode changed */
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits; /* number of bits used for the block size */
u8 i_write_hint;
blkcnt_t i_blocks; /* number of blocks used by the file (only used by the quota subsystem) */

#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif

/* Misc */
unsigned long i_state;
struct rw_semaphore i_rwsem;

unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;

struct hlist_node i_hash;
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */

/* foreign inode detection, see wbc_detach_inode() */
int i_wb_frn_winner;
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
atomic64_t i_version;
atomic_t i_count; /* inode usage counter (how many kernel components use this inode) */
atomic_t i_dio_count;
atomic_t i_writecount;
#ifdef CONFIG_IMA
atomic_t i_readcount; /* struct files open RO */
#endif
const struct file_operations *i_fop; /* set of callbacks that operate on the file */
struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
};

__u32 i_generation;

#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
struct fscrypt_info *i_crypt_info;
#endif

void *i_private; /* fs or device private pointer */
} __randomize_layout;

相关 API 如下:

1
2
3
4
5
6
7
struct inode *new_inode(struct super_block *sb); /* create a new inode; sets i_nlink to 1 and initialises i_blkbits, i_sb and i_dev */
void insert_inode_hash(struct inode *inode); /* add the inode to the inode hash table; if the inode is marked dirty it will be written to disk */
void mark_inode_dirty(struct inode *inode); /* mark the inode as dirty; it will be written back to disk at a later moment */
struct inode * iget_locked(struct super_block *, unsigned long); /* load the inode with the given number from disk */
void unlock_new_inode(struct inode *); /* used together with iget_locked; releases the lock on the inode */
void iput(struct inode *); /* tell the kernel that work on the inode is finished; if nobody else uses it, it is destroyed (after being written back to disk if it is dirty) */
void make_bad_inode(struct inode *); /* tell the kernel that the inode cannot be used */

创建一个 inode:

  • 通常,此函数将调用 iget_locked 从 VFS 获取 inode 结构,如果 inode 是新创建的,则需要从磁盘读取对应的 inode(使用 sb_bread)并填写有用的信息
  • 例如文件系统 minix 的 minix_iget 函数 :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
struct inode *minix_iget(struct super_block *sb, unsigned long ino)
{
struct inode *inode;
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;

if (INODE_VERSION(inode) == MINIX_V1)
return V1_minix_iget(inode);
...
}

static struct inode *V1_minix_iget(struct inode *inode)
{
struct buffer_head * bh;
struct minix_inode * raw_inode;
struct minix_inode_info *minix_inode = minix_i(inode);
int i;

raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
if (!raw_inode) {
iget_failed(inode);
return ERR_PTR(-EIO);
...
}
  • minix_iget 会先调用 iget_locked 来获取具有给定编号的 inode
  • 如果返回的 inode 是新分配的(i_state 带有 I_NEW 标志),程序将调用 V1_minix_iget,进而调用 minix_V1_raw_inode 从磁盘读取原始 inode,然后用读取到的信息填充 inode

The File Structure

文件结构对应于进程打开的文件,仅存在于内存中,与 inode 相关联

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
struct file {
union {
struct llist_node fu_llist; /* 文件对象链表 */
struct rcu_head fu_rcuhead; /* 释放之后的RCU链表 */
} f_u;
struct path f_path; /* 包含的目录项 */
struct inode *f_inode; /* 索引结点 */
const struct file_operations *f_op; /* 指向操作file的回调函数集 */

spinlock_t f_lock; /* 保护文件的自旋锁 */
enum rw_hint f_write_hint;
atomic_long_t f_count;
unsigned int f_flags; /* 文件标志:O_RDONLY,O_NONBLOCK,O_SYNC */
fmode_t f_mode; /* 文件读/写模式:FMODE_READ,FMODE_WRITE */
struct mutex f_pos_lock;
loff_t f_pos; /* 当前读写位置 */
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;

u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data; /* 文件私有数据 */

#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping; /* 指向该页所在地址空间描述结构的指针 */
errseq_t f_wb_err;
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
  • 文件系统的文件操作 file->f_op 是使用 inode->i_fop 字段初始化的,以便后续系统调用使用存储在 file->f_op 中的值
  • 结构体 file 中还有一个有意思的条目 address_space,值得单独分析一下(其实这个条目是由 inode->i_data 进行初始化的)

Address space operations

进程的地址空间和文件之间有着密切的联系:

  • 程序的执行几乎完全是通过将文件映射到进程地址空间来完成的(例如 execve
  • 由于此方法运行良好且非常通用,因此也可用于常规系统调用,如读取和写入

描述地址空间的结构是 address_space(也被称为地址空间描述符),并且使用它的操作由结构体 address_space_operations 描述,要初始化地址空间操作,必须填写 inode->i_mapping->a_ops

结构体 address_space 用于管理 “索引结点inode” 到 “内存页面-page” 的映射:

  • 一个文件对应一个 address_space 结构
  • 一个 address_space 与一个偏移量能够确定 page cache 或 swap cache 中的一个页面
  • 结构体 address_space 的条目如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
struct address_space {
struct inode *host; /* 指向对应的索引结点 */
struct xarray i_pages;
gfp_t gfp_mask;
atomic_t i_mmap_writable; /* 共享映射数VM_SHARED记数 */
struct rb_root_cached i_mmap; /* 优先搜索树的root */
struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages; /* 页总数 */
unsigned long nrexceptional;
pgoff_t writeback_index; /* 回写的起始偏移 */
const struct address_space_operations *a_ops; /* 操作函数表 */
unsigned long flags; /* 掩码与错误标识 */
errseq_t wb_err;
spinlock_t private_lock; /* 私有address_space锁 */
struct list_head private_list; /* 私有address_space链表 */
void *private_data; /* 私有数据 */
} __attribute__((aligned(sizeof(long)))) __randomize_layout;

The Dentry Structure

目录项 Dentry 的主要任务是在 inode 和文件名之间建立链接,该结构的重要字段如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* Directory entry: links a file name to an inode. */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */
struct dentry *d_parent; /* dentry of the parent directory */
struct qstr d_name; /* structure holding the dentry name and its length */
struct inode *d_inode; /* inode this dentry refers to */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */

/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op; /* set of callbacks that operate on the dentry */
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* field reserved for the filesystem that implements the dentry operations */

union {
struct list_head d_lru; /* LRU list */
wait_queue_head_t *d_wait; /* in-lookup ones only */
};
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias; /* inode alias list */
struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
struct rcu_head d_rcu;
} d_u;
} __randomize_layout;
  • 内核使用 Dentry 来构建并管理文件系统的目录树
  • 目录项在内核中起到了连接不同的文件对象 inode 的作用,进而起到了维护文件系统目录树的作用

Bitmap operations

使用文件系统时,管理信息(哪个 block 是空闲或忙碌,哪个 inode 是空闲或忙碌)使用位图 Bitmap 存储,为此,我们经常需要使用位运算,此类操作包括:

1
2
3
4
5
6
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); /* 在bitmap指定范围内找到第一个zero bit的位置 */
unsigned long find_first_bit(const unsigned long *addr, unsigned long size); /* 在bitmap指定范围内找到第一个bit的位置 */
void set_bit(int nr, volatile void *addr); /* 将指针指向的数据的第nr位,置"1" */
void clear_bit(int nr, volatile void *addr); /* 将指针指向的数据的第nr位,置"0" */
int test_and_set_bit(int nr, volatile void *addr); /* 将指针指向的数据的第nr位,置"1",并返回原来这一位的值 */
int test_and_clear_bit(int nr, volatile void *addr); /* 将指针指向的数据的第nr位,置"0",并返回原来这一位的值 */

下面列出了一些用法示例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
unsigned int map;
unsigned char array_map[NUM_BYTES];
size_t idx;
int changed;

/* Find first zero bit in 32 bit integer. */
idx = find_first_zero_bit(&map, 32);
printk (KERN_ALERT "The %zu-th bit is the first zero bit.\n", idx);

/* Find first one bit in NUM_BYTES bytes array. */
idx = find_first_bit(array_map, NUM_BYTES * 8);
printk (KERN_ALERT "The %zu-th bit is the first one bit.\n", idx);

/*
* Clear the idx-th bit in integer.
* It is assumed idx is less the number of bits in integer.
*/
clear_bit(idx, &map);

/*
* Test and set the idx-th bit in array.
* It is assumed idx is less the number of bits in array.
*/
changed = __test_and_set_bit(idx, &sbi->imap);
if (changed)
printk(KERN_ALERT "%zu-th bit changed\n", idx);

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=filesystems make skels
make build

1.myfs 完整代码:

首先,我们计划熟悉 Linux 内核和虚拟文件系统 (VFS) 组件对外提供的接口:

  • 设计并使用一个简单的虚拟文件系统(即没有物理磁盘支持)
  • 文件系统称为 myfs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
/*
* SO2 Lab - Filesystem drivers
* Exercise #1 (no-dev filesystem)
*/

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>

MODULE_DESCRIPTION("Simple no-dev filesystem");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define MYFS_BLOCKSIZE 4096
#define MYFS_BLOCKSIZE_BITS 12
#define MYFS_MAGIC 0xbeefcafe
#define LOG_LEVEL KERN_ALERT

/* declarations of functions that are part of operation structures */

static int myfs_mknod(struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev);
static int myfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl);
static int myfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);

/* TODO 2: define super_operations structure */
static const struct super_operations myfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_drop_inode,
};

static const struct inode_operations myfs_dir_inode_operations = {
/* TODO 5: Fill dir inode operations structure. */
.create = myfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.mkdir = myfs_mkdir,
.rmdir = simple_rmdir,
.mknod = myfs_mknod,
.rename = simple_rename,
};

static const struct file_operations myfs_file_operations = {
/* TODO 6: Fill file operations structure. */
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.llseek = generic_file_llseek,
};

static const struct inode_operations myfs_file_inode_operations = {
/* TODO 6: Fill file inode operations structure. */
.getattr = simple_getattr,
};

static const struct address_space_operations myfs_aops = {
/* TODO 6: Fill address space operations structure. */
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
};

/*
 * myfs_get_inode() - allocate and initialise a new myfs inode.
 * @sb:   superblock the inode is allocated on
 * @dir:  parent directory inode handed to inode_init_owner(); NULL for the
 *        root inode
 * @mode: file type and permission bits of the new inode
 *
 * Directories get the myfs directory inode operations, the generic simple
 * directory file operations and one extra link; regular files get the myfs
 * file inode/file operations.
 *
 * Return: the new inode, or NULL if new_inode() fails.
 */
struct inode *myfs_get_inode(struct super_block *sb, const struct inode *dir,
		int mode)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	/* TODO 3: fill inode structure
	 *     - mode
	 *     - uid
	 *     - gid
	 *     - atime,ctime,mtime
	 *     - ino
	 */
	inode_init_owner(inode, dir, mode);
	/* sample the clock once so the three timestamps are identical */
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	/* TODO 5: Init i_ino using get_next_ino */
	inode->i_ino = get_next_ino();

	/* TODO 6: Initialize address space operations. */
	inode->i_mapping->a_ops = &myfs_aops;

	if (S_ISDIR(mode)) {
		/* TODO 3/5: directory inode and file operations */
		inode->i_op = &myfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* TODO 3: directory inodes start off with i_nlink == 2 (for "." entry).
		 * Directory link count should be incremented (use inc_nlink).
		 */
		inc_nlink(inode);
	}

	/* TODO 6: Set file inode and file operations for regular files
	 * (use the S_ISREG macro).
	 */
	if (S_ISREG(mode)) {
		inode->i_op = &myfs_file_inode_operations;
		inode->i_fop = &myfs_file_operations;
	}

	return inode;
}

/* TODO 5: Implement myfs_mknod, myfs_create, myfs_mkdir. */
/*
 * myfs_mknod() - create a new inode and attach it to @dentry.
 * @dir:    directory the new entry is created in
 * @dentry: dentry of the new entry
 * @mode:   file type and permission bits
 * @dev:    device number (not used by this implementation)
 *
 * Return: 0 on success, -ENOSPC if no inode could be obtained.
 */
static int myfs_mknod(struct inode *dir,
		struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *node;

	node = myfs_get_inode(dir->i_sb, dir, mode);
	if (!node)
		return -ENOSPC;

	d_instantiate(dentry, node);
	/* take an extra reference on the dentry */
	dget(dentry);

	/* the parent directory was modified: refresh its ctime and mtime */
	dir->i_ctime = current_time(node);
	dir->i_mtime = dir->i_ctime;

	return 0;
}

/*
 * myfs_create() - create a regular file; thin wrapper around myfs_mknod().
 * @excl is not used.
 *
 * Return: the result of myfs_mknod().
 */
static int myfs_create(struct inode *dir, struct dentry *dentry,
		umode_t mode, bool excl)
{
	umode_t file_mode = mode | S_IFREG;

	return myfs_mknod(dir, dentry, file_mode, 0);
}

/*
 * myfs_mkdir() - create a directory inside @dir.
 *
 * On success the link count of the parent directory is incremented.
 *
 * Return: 0 on success, otherwise the error returned by myfs_mknod().
 */
static int myfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int err = myfs_mknod(dir, dentry, mode | S_IFDIR, 0);

	if (!err)
		inc_nlink(dir);

	return err;
}

/*
 * myfs_fill_super() - initialise the superblock of a myfs mount.
 * @sb:     superblock to fill in
 * @data:   mount data (not used)
 * @silent: not used
 *
 * Sets the generic superblock parameters, creates the root directory inode
 * (mode 755) and installs its dentry as sb->s_root.
 *
 * Return: 0 on success, -ENOMEM if the root inode or root dentry cannot be
 * created.
 */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *root_inode;
	struct dentry *root_dentry;

	/* TODO 2: fill super_block
	 *   - blocksize, blocksize_bits
	 *   - magic
	 *   - super operations
	 *   - maxbytes
	 */
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = MYFS_BLOCKSIZE;
	sb->s_blocksize_bits = MYFS_BLOCKSIZE_BITS;
	sb->s_magic = MYFS_MAGIC;
	sb->s_op = &myfs_ops;

	/* mode = directory & access rights (755) */
	root_inode = myfs_get_inode(sb, NULL,
			S_IFDIR | S_IRWXU | S_IRGRP |
			S_IXGRP | S_IROTH | S_IXOTH);
	/* check before the first dereference of root_inode below */
	if (!root_inode)
		return -ENOMEM;

	printk(LOG_LEVEL "root inode has %d link(s)\n", root_inode->i_nlink);

	/*
	 * d_make_root() consumes the inode reference whether it succeeds or
	 * fails, so root_inode must not be iput() again on the failure path.
	 */
	root_dentry = d_make_root(root_inode);
	if (!root_dentry)
		return -ENOMEM;
	sb->s_root = root_dentry;

	return 0;
}

/*
 * myfs_mount() - mount callback of the myfs filesystem type.
 *
 * Delegates to mount_nodev() with myfs_fill_super() as the superblock
 * initialiser; @dev_name is not used.
 *
 * Return: whatever mount_nodev() returns.
 */
static struct dentry *myfs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	struct dentry *root;

	/* TODO 1: call superblock mount function */
	root = mount_nodev(fs_type, flags, data, myfs_fill_super);

	return root;
}

/* TODO 1: define file_system_type structure */
static struct file_system_type my_fs_type = {
.owner = THIS_MODULE,
.name = "myfs",
.mount = myfs_mount,
.kill_sb = kill_litter_super,
};

/*
 * myfs_init() - module init: register the myfs filesystem type.
 *
 * Return: 0 on success, otherwise the error from register_filesystem().
 */
static int __init myfs_init(void)
{
	/* TODO 1: register */
	int rc = register_filesystem(&my_fs_type);

	if (rc)
		printk(LOG_LEVEL "register_filesystem failed\n");

	return rc;
}

/* Module exit point: unregisters my_fs_type via unregister_filesystem(). */
static void __exit myfs_exit(void)
{
/* TODO 1: unregister */
unregister_filesystem(&my_fs_type);
}

module_init(myfs_init);
module_exit(myfs_exit);
  • 第一次写文件系统驱动,很多东西都不懂,所以很大程度上参考了答案
  • 感觉我自己写的时候就是 API 操作不熟悉,在网上不一定能找到正确的 API,有些 API 有特殊的运用场景,不能随便使用
  • 根据传入参数和返回值可以判断一些 API 是否符合使用场景,但后来懒得一个一个试就直接看答案了

2.minfs 完整代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
/*
* SO2 Lab - Filesystem drivers
* Exercise #2 (dev filesystem)
*/

#include <linux/buffer_head.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>

#include "minfs.h"

MODULE_DESCRIPTION("Simple filesystem");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define LOG_LEVEL KERN_ALERT


/*
 * In-memory superblock state, stored in super_block->s_fs_info by
 * minfs_fill_super().
 */
struct minfs_sb_info {
__u8 version; /* copied from the on-disk superblock */
unsigned long imap; /* inode allocation bitmap (one bit per inode number) */
struct buffer_head *sbh; /* buffer holding the on-disk superblock block */
};

/*
 * Filesystem-private inode wrapper. The VFS inode is embedded so that
 * container_of() can recover this structure from a struct inode pointer.
 */
struct minfs_inode_info {
__u16 data_block; /* block number passed to sb_bread() for this inode's data */
struct inode vfs_inode; /* embedded VFS inode handed out to the kernel */
};

/* declarations of functions that are part of operation structures */

static int minfs_readdir(struct file *filp, struct dir_context *ctx);
static struct dentry *minfs_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags);
static int minfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl);

/* dir and inode operation structures */

/* File operations installed on directory inodes (see minfs_iget()). */
static const struct file_operations minfs_dir_operations = {
	.iterate	= minfs_readdir,
	.read		= generic_read_dir,
};

/* Inode operations installed on directory inodes (see minfs_iget()). */
static const struct inode_operations minfs_dir_inode_operations = {
	/* TODO 7: Use minfs_create as the create function. */
	.create		= minfs_create,
	.lookup		= minfs_lookup,
};

/* Address-space operations; every hook is the generic simple_* helper. */
static const struct address_space_operations minfs_aops = {
	.write_end	= simple_write_end,
	.write_begin	= simple_write_begin,
	.readpage	= simple_readpage,
};

/* File operations for regular files; all entries are generic kernel helpers. */
static const struct file_operations minfs_file_operations = {
	.llseek		= generic_file_llseek,
	.mmap		= generic_file_mmap,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
};

/* Inode operations for regular files: only getattr is provided (simple_getattr). */
static const struct inode_operations minfs_file_inode_operations = {
.getattr = simple_getattr,
};

static struct inode *minfs_iget(struct super_block *s, unsigned long ino)
{
struct minfs_inode *mi;
struct buffer_head *bh;
struct inode *inode;
struct minfs_inode_info *mii;

/* Allocate VFS inode. */
inode = iget_locked(s, ino);
if (inode == NULL) {
printk(LOG_LEVEL "error aquiring inode\n");
return ERR_PTR(-ENOMEM);
}

/* Return inode from cache */
if (!(inode->i_state & I_NEW))
return inode;

/* TODO 4: Read block with inodes. It's the second block on
* the device, i.e. the block with the index 1. This is the index
* to be passed to sb_bread().
*/
bh = sb_bread(s,1);
if(bh==NULL){
goto out_bad_sb;
}
/* TODO 4: Get inode with index ino from the block. */
mi = ((struct minfs_inode *)bh->b_data) + ino;
/* TODO 4: fill VFS inode */
inode->i_mode = mi->mode;
inode->i_size = mi->size;
inode->i_blocks = mi->data_block;
i_uid_write(inode, mi->uid);
i_gid_write(inode, mi->gid);
inode->i_mtime = current_time(inode);
inode->i_atime = current_time(inode);
inode->i_ctime = current_time(inode);

/* TODO 7: Fill address space operations (inode->i_mapping->a_ops) */
inode->i_mapping->a_ops = &minfs_aops;
if (S_ISDIR(inode->i_mode)) {
/* TODO 4: Fill dir inode operations. */
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* TODO 5: Use minfs_dir_inode_operations for i_op
* and minfs_dir_operations for i_fop. */
inode->i_op = &minfs_dir_inode_operations;
inode->i_fop = &minfs_dir_operations;
/* TODO 4: Directory inodes start off with i_nlink == 2.
* (use inc_link) */
inc_nlink(inode);
}

/* TODO 7: Fill inode and file operations for regular files
* (i_op and i_fop). Use the S_ISREG macro.
*/
if(S_ISREG(inode->i_mode)){
inode->i_op = &minfs_file_inode_operations;
inode->i_fop = &minfs_file_operations;
}

/* fill data for mii */
mii = container_of(inode, struct minfs_inode_info, vfs_inode);
/* TODO 4: uncomment after the minfs_inode is initialized */
mii->data_block = mi->data_block;
/* Free resources. */
/* TODO 4: uncomment after the buffer_head is initialized */
brelse(bh);
unlock_new_inode(inode);

return inode;

out_bad_sb:
iget_failed(inode);
return NULL;
}

static int minfs_readdir(struct file *filp, struct dir_context *ctx)
{
struct buffer_head *bh;
struct minfs_dir_entry *de;
struct minfs_inode_info *mii;
struct inode *inode;
struct super_block *sb;
int over;
int err = 0;

/* TODO 5: Get inode of directory and container inode. */
inode = file_inode(filp);
mii = container_of(inode, struct minfs_inode_info, vfs_inode);
/* TODO 5: Get superblock from inode (i_sb). */
sb = inode->i_sb;
/* TODO 5: Read data block for directory inode. */
bh = sb_bread(sb, mii->data_block);
if (bh == NULL) {
err = -ENOMEM;
goto out_bad_sb;
}
for (; ctx->pos < MINFS_NUM_ENTRIES; ctx->pos++) {
/* TODO 5: Data block contains an array of
* "struct minfs_dir_entry". Use `de' for storing.
*/
de = (struct minfs_dir_entry *) bh->b_data + ctx->pos;
/* TODO 5: Step over empty entries (de->ino == 0). */
if (de->ino == 0) {
continue;
}
/*
* Use `over` to store return value of dir_emit and exit
* if required.
*/
over = dir_emit(ctx, de->name, MINFS_NAME_LEN, de->ino,
DT_UNKNOWN);
if (over) {
printk(KERN_DEBUG "Read %s from folder %s, ctx->pos: %lld\n",
de->name,
filp->f_path.dentry->d_name.name,
ctx->pos);
ctx->pos++;
goto done;
}
}

done:
brelse(bh);
out_bad_sb:
return err;
}

/*
* Find dentry in parent folder. Return parent folder's data buffer_head.
*/

static struct minfs_dir_entry *minfs_find_entry(struct dentry *dentry,
struct buffer_head **bhp)
{
struct buffer_head *bh;
struct inode *dir = dentry->d_parent->d_inode;
struct minfs_inode_info *mii = container_of(dir,
struct minfs_inode_info, vfs_inode);
struct super_block *sb = dir->i_sb;
const char *name = dentry->d_name.name;
struct minfs_dir_entry *final_de = NULL;
struct minfs_dir_entry *de;
int i;

/* TODO 6: Read parent folder data block (contains dentries).
* Fill bhp with return value.
*/
bh = sb_bread(sb,mii->data_block);
if (bh == NULL) {
return NULL;
}
*bhp = bh;
for (i = 0; i < MINFS_NUM_ENTRIES; i++) {
/* TODO 6: Traverse all entries, find entry by name
* Use `de' to traverse. Use `final_de' to store dentry
* found, if existing.
*/
de = ((struct minfs_dir_entry *) bh->b_data) + i;
if (de->ino != 0) {
/* found it */
if (strcmp(name, de->name) == 0) {
printk(KERN_DEBUG "Found entry %s on position: %zd\n",
name, i);
final_de = de;
break;
}
}
}

/* bh needs to be released by caller. */
return final_de;
}

static struct dentry *minfs_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
/* TODO 6: Comment line. */
return simple_lookup(dir, dentry, flags);

struct super_block *sb = dir->i_sb;
struct minfs_dir_entry *de;
struct buffer_head *bh = NULL;
struct inode *inode = NULL;

dentry->d_op = sb->s_root->d_op;

de = minfs_find_entry(dentry, &bh);
if (de != NULL) {
printk(KERN_DEBUG "getting entry: name: %s, ino: %d\n",
de->name, de->ino);
inode = minfs_iget(sb, de->ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
}

d_add(dentry, inode);
brelse(bh);

printk(KERN_DEBUG "looked up dentry %s\n", dentry->d_name.name);

return NULL;
}

static struct inode *minfs_alloc_inode(struct super_block *s)
{
struct minfs_inode_info *mii;

/* TODO 3: Allocate minfs_inode_info. */
mii = (struct minfs_inode_info *)kmalloc(sizeof(struct minfs_inode_info),0);
/* TODO 3: init VFS inode in minfs_inode_info */
inode_init_once(&mii->vfs_inode);
return &mii->vfs_inode;
}

/*
 * super_operations.destroy_inode hook: release the minfs_inode_info
 * that embeds @inode (the counterpart of minfs_alloc_inode()).
 */
static void minfs_destroy_inode(struct inode *inode)
{
	/* TODO 3: free minfs_inode_info */
	kfree(container_of(inode, struct minfs_inode_info, vfs_inode));
}

/*
 * Create a new VFS inode in the filesystem of @dir.
 *
 * Picks the first free inode number from the in-memory bitmap,
 * allocates a VFS inode, initialises owner and timestamps, and inserts
 * it into the inode hash. The bitmap is only updated once the
 * allocation has succeeded, so a failure does not leak an inode number.
 *
 * Returns the new inode, or NULL if the bitmap is full or the VFS inode
 * cannot be allocated.
 */

static struct inode *minfs_new_inode(struct inode *dir)
{
	struct super_block *sb = dir->i_sb;
	struct minfs_sb_info *sbi = sb->s_fs_info;
	struct inode *inode;
	int idx;

	/* TODO 7: Find first available inode. */
	idx = find_first_zero_bit(&sbi->imap, MINFS_NUM_INODES);
	if (idx == MINFS_NUM_INODES) {
		printk(LOG_LEVEL "no space left in imap\n");
		return NULL;
	}

	/* TODO 7: Call new_inode(), fill inode fields
	 * and insert inode into inode hash table.
	 */
	inode = new_inode(sb);
	if (inode == NULL)
		return NULL;

	/* TODO 7: Mark the inode as used in the bitmap and mark
	 * the superblock buffer head as dirty.
	 */
	__set_bit(idx, &sbi->imap);
	mark_buffer_dirty(sbi->sbh);

	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_ino = idx;
	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
	inode->i_blocks = 0;
	insert_inode_hash(inode);

	/* Actual writing to the disk will be done in minfs_write_inode,
	 * which will be called at a later time.
	 */

	return inode;
}

/*
* Add dentry link on parent inode disk structure.
*/

static int minfs_add_link(struct dentry *dentry, struct inode *inode)
{
struct buffer_head *bh;
struct inode *dir;
struct super_block *sb;
struct minfs_inode_info *mii;
struct minfs_dir_entry *de;
int i;
int err = 0;

/* TODO 7: Get: directory inode (in inode); containing inode (in mii); superblock (in sb). */
dir = dentry->d_parent->d_inode;
mii = container_of(dir, struct minfs_inode_info, vfs_inode);
sb = dir->i_sb;
/* TODO 7: Read dir data block (use sb_bread). */
bh = sb_bread(sb, mii->data_block);
/* TODO 7: Find first free dentry (de->ino == 0). */
for (i = 0; i < MINFS_NUM_ENTRIES; i++) {
de = (struct minfs_dir_entry *) bh->b_data + i;
if (de->ino == 0)
break;
}

if (i == MINFS_NUM_ENTRIES) {
err = -ENOSPC;
goto out;
}
/* TODO 7: Place new entry in the available slot. Mark buffer_head
* as dirty. */
de->ino = inode->i_ino;
memcpy(de->name, dentry->d_name.name, MINFS_NAME_LEN);
dir->i_mtime = dir->i_ctime = current_time(inode);
mark_buffer_dirty(bh);

out:
brelse(bh);

return err;
}

/*
 * inode_operations.create hook: make a new regular file named by
 * @dentry inside @dir.
 *
 * A fresh inode is obtained from minfs_new_inode(), given the regular
 * file operation tables and a data block derived from its inode number,
 * then linked into the parent directory. Returns 0 on success, -ENOMEM
 * if no inode could be obtained, or the error from minfs_add_link().
 */

static int minfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		bool excl)
{
	struct minfs_inode_info *info;
	struct inode *new;
	int rc;

	new = minfs_new_inode(dir);
	if (!new) {
		printk(LOG_LEVEL "error allocating new inode\n");
		return -ENOMEM;
	}

	info = container_of(new, struct minfs_inode_info, vfs_inode);
	info->data_block = MINFS_FIRST_DATA_BLOCK + new->i_ino;
	new->i_fop = &minfs_file_operations;
	new->i_op = &minfs_file_inode_operations;
	new->i_mode = mode;

	rc = minfs_add_link(dentry, new);
	if (rc != 0) {
		/* Undo: drop the link count and our reference. */
		inode_dec_link_count(new);
		iput(new);
		return rc;
	}

	d_instantiate(dentry, new);
	mark_inode_dirty(new);

	printk(KERN_DEBUG "new file inode created (ino = %lu)\n",
	       new->i_ino);

	return 0;
}

/*
 * super_operations.write_inode hook: copy the VFS inode fields into the
 * matching record of the on-disk inode block and mark that block dirty.
 *
 * Returns 0 on success or -ENOMEM if the inode block cannot be read.
 * @wbc is not consulted.
 */

static int minfs_write_inode(struct inode *inode,
		struct writeback_control *wbc)
{
	struct minfs_inode_info *info =
		container_of(inode, struct minfs_inode_info, vfs_inode);
	struct buffer_head *blk;
	struct minfs_inode *disk;

	blk = sb_bread(inode->i_sb, MINFS_INODE_BLOCK);
	if (!blk) {
		printk(LOG_LEVEL "could not read block\n");
		return -ENOMEM;
	}

	/* The block is an array of on-disk inodes indexed by inode number. */
	disk = &((struct minfs_inode *)blk->b_data)[inode->i_ino];

	/* fill disk inode */
	disk->data_block = info->data_block;
	disk->size = inode->i_size;
	disk->gid = i_gid_read(inode);
	disk->uid = i_uid_read(inode);
	disk->mode = inode->i_mode;

	printk(KERN_DEBUG "mode is %05o; data_block is %d\n", disk->mode,
	       info->data_block);

	mark_buffer_dirty(blk);
	brelse(blk);

	printk(KERN_DEBUG "wrote inode %lu\n", inode->i_ino);

	return 0;
}

static void minfs_put_super(struct super_block *sb)
{
struct minfs_sb_info *sbi = sb->s_fs_info;

/* Free superblock buffer head. */
mark_buffer_dirty(sbi->sbh);
brelse(sbi->sbh);

printk(KERN_DEBUG "released superblock resources\n");
}

/* Superblock operations installed by minfs_fill_super(). */
static const struct super_operations minfs_ops = {
	/* TODO 4: add alloc and destroy inode functions */
	.alloc_inode	= minfs_alloc_inode,
	.destroy_inode	= minfs_destroy_inode,
	/* TODO 7: = set write_inode function. */
	.write_inode	= minfs_write_inode,
	.put_super	= minfs_put_super,
	.statfs		= simple_statfs,
};

static int minfs_fill_super(struct super_block *s, void *data, int silent)
{
struct minfs_sb_info *sbi;
struct minfs_super_block *ms;
struct inode *root_inode;
struct dentry *root_dentry;
struct buffer_head *bh;
int ret = -EINVAL;

sbi = kzalloc(sizeof(struct minfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
s->s_fs_info = sbi;

/* Set block size for superblock. */
if (!sb_set_blocksize(s, MINFS_BLOCK_SIZE))
goto out_bad_blocksize;

/* TODO 2: Read block with superblock. It's the first block on
* the device, i.e. the block with the index 0. This is the index
* to be passed to sb_bread().
*/
bh = sb_bread(s,0);
if(bh == NULL){
goto out_bad_sb;
}
/* TODO 2: interpret read data as minfs_super_block */
ms = (struct minfs_super_block*)bh->b_data;
/* TODO 2: check magic number with value defined in minfs.h. jump to out_bad_magic if not suitable */
if(ms->magic != MINFS_MAGIC){
goto out_bad_magic;
}
/* TODO 2: fill super_block with magic_number, super_operations */
s->s_magic = MINFS_MAGIC;
s->s_op = &minfs_ops;
/* TODO 2: Fill sbi with rest of information from disk superblock
* (i.e. version).
*/
sbi->version = ms->version;
sbi->imap = ms->imap;
/* allocate root inode and root dentry */
/* TODO 2: use myfs_get_inode instead of minfs_iget */
root_inode = minfs_iget(s, MINFS_ROOT_INODE);
if (!root_inode)
goto out_bad_inode;

root_dentry = d_make_root(root_inode);
if (!root_dentry)
goto out_iput;
s->s_root = root_dentry;

/* Store superblock buffer_head for further use. */
sbi->sbh = bh;

return 0;

out_iput:
iput(root_inode);
out_bad_inode:
printk(LOG_LEVEL "bad inode\n");
out_bad_magic:
printk(LOG_LEVEL "bad magic number\n");
brelse(bh);
out_bad_sb:
printk(LOG_LEVEL "error reading buffer_head\n");
out_bad_blocksize:
printk(LOG_LEVEL "bad block size\n");
s->s_fs_info = NULL;
kfree(sbi);
return ret;
}

/*
 * Mount callback referenced by minfs_fs_type. Delegates to mount_bdev(),
 * passing the device name through and using minfs_fill_super() to
 * populate the superblock.
 */
static struct dentry *minfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
/* TODO 1: call superblock mount function */
return mount_bdev(fs_type, flags, dev_name, data, minfs_fill_super);
}

static struct file_system_type minfs_fs_type = {
.owner = THIS_MODULE,
.name = "minfs",
/* TODO 1: add mount, kill_sb and fs_flags */
.mount = minfs_mount,
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};

/*
 * Module entry point: registers minfs_fs_type with the kernel.
 * Returns 0 on success or the error from register_filesystem(),
 * after logging it.
 */
static int __init minfs_init(void)
{
	int rc = register_filesystem(&minfs_fs_type);

	if (rc != 0)
		printk(LOG_LEVEL "register_filesystem failed\n");

	return rc;
}

/* Module exit point: unregisters minfs_fs_type via unregister_filesystem(). */
static void __exit minfs_exit(void)
{
unregister_filesystem(&minfs_fs_type);
}

module_init(minfs_init);
module_exit(minfs_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
root@qemux86:~/skels/filesystems/minfs/user# set -ex
root@qemux86:~/skels/filesystems/minfs/user# insmod ../kernel/minfs.ko
+ insmod ../kernel/minfs.ko
minfs: loading out-of-tree module taints kernel.
root@qemux86:~/skels/filesystems/minfs/user# mkdir -p /mnt/minfs
+ mkdir -p /mnt/minfs
root@qemux86:~/skels/filesystems/minfs/user# ./mkfs.minfs /dev/vdb
+ ./mkfs.minfs /dev/vdb
root@qemux86:~/skels/filesystems/minfs/user# mount -t minfs /dev/vdb /mnt/minfs
+ mount -t minfs /dev/vdb /mnt/minfs
root@qemux86:~/skels/filesystems/minfs/user# cat /proc/filesystems | grep minfs
+ cat /proc/filesystems
+ grep minfs
nodev minfs
root@qemux86:~/skels/filesystems/minfs/user# cat /proc/mounts | grep minfs
+ + grep minfs
cat /proc/mounts
/dev/vdb /mnt/minfs minfs rw,relatime 0 0
root@qemux86:~/skels/filesystems/minfs/user# stat -f /mnt/minfs
+ stat -f /mnt/minfs
File: "/mnt/minfs"
ID: 0 Namelen: 255 Type: UNKNOWN
Block size: 4096
Blocks: Total: 0 Free: 0 Available: 0
Inodes: Total: 0 Free: 0
root@qemux86:~/skels/filesystems/minfs/user# cd /mnt/minfs
+ cd /mnt/minfs
root@qemux86:/mnt/minfs# ls -la
Read a.txt from folder /, ctx->pos: 0
ls: ./a.txt: No such file or directory
root@qemux86:/mnt/minfs# mode is 40755; data_block is 2
wrote inode 0

root@qemux86:/mnt/minfs# cd ..
root@qemux86:/mnt# umount /mnt/minfs
released superblock resources
root@qemux86:/mnt# rmmod minfs
  • 感觉本实验其实就主要完成了两个工作:
    • register_filesystem(&minfs_fs_type) 和 unregister_filesystem(&minfs_fs_type)
    • 其他的操作都是对上面这两个操作的完善
  • 新注册文件系统只有一个操作是需要由我们完成的:minfs_mount
1
2
3
4
5
6
7
static struct file_system_type minfs_fs_type = {
.owner = THIS_MODULE,
.name = "minfs",
.mount = minfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_USERNS_MOUNT,
};
  • 而在 minfs_mount 我们又只需要完成用于填充超级块的函数 minfs_fill_super
  • 这一部分和模板差不多,使用 sb_bread 读出超级块,校验 magic number 并且把超级块的信息填入 minfs_sb_info,我们需要完成 minfs_ops 中的函数:
1
2
3
4
5
6
7
/* Superblock operations for minfs: inode lifetime, write-back and teardown. */
static const struct super_operations minfs_ops = {
	.alloc_inode	= minfs_alloc_inode,
	.destroy_inode	= minfs_destroy_inode,
	.write_inode	= minfs_write_inode,
	.put_super	= minfs_put_super,
	.statfs		= simple_statfs,
};
  • 另外程序用于读取 inode 的 minfs_iget 函数需要实现
  • minfs_iget 中:
    • 先是使用 iget_locked(s, ino) 从挂载的文件系统获取 inode
    • 然后就是对 inode 的初始化,分配 address_space_operations
    • 再根据 inode 类型为其分配对应的 inode_operations 和 file_operations
    • 最后返回 inode
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/* File operations for directories: listing is implemented by minfs_readdir(). */
static const struct file_operations minfs_dir_operations = {
	.iterate	= minfs_readdir,
	.read		= generic_read_dir,
};

/* File operations for regular files; every hook is a generic kernel helper. */
static const struct file_operations minfs_file_operations = {
	.llseek		= generic_file_llseek,
	.mmap		= generic_file_mmap,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
};

/* Inode operations for directories: name lookup and file creation. */
static const struct inode_operations minfs_dir_inode_operations = {
	.create		= minfs_create,
	.lookup		= minfs_lookup,
};

/* Inode operations for regular files: only getattr is provided (simple_getattr). */
static const struct inode_operations minfs_file_inode_operations = {
.getattr = simple_getattr,
};

/* Address-space operations; every hook is the generic simple_* helper. */
static const struct address_space_operations minfs_aops = {
	.write_end	= simple_write_end,
	.write_begin	= simple_write_begin,
	.readpage	= simple_readpage,
};
  • 其中又需要我们实现的函数有:minfs_create minfs_lookup minfs_readdir
  • 借助参考答案和多次试错,感觉大体的流程清楚了,不过细节还需要打磨

Block Device Drivers

实验室目标:

  • 获取有关 Linux 上 I/O 子系统行为的知识
  • 块设备的结构和功能的实践活动
  • 通过解决练习,获得将API用于块设备的基本技能

块设备的特点是随机访问以固定大小的块组织的数据(此类设备的示例包括硬盘驱动器,CD-ROM驱动器,RAM磁盘等)

块设备的速度一般远高于字符设备的速度,它们的性能也很重要(这就是为什么 Linux 内核以不同的方式处理这两种类型的设备),因此,使用块设备比使用字符设备更复杂:

  • 字符设备具有单个当前位置
  • 块设备必须能够移动到设备中的任何位置以提供对数据的随机访问

为了简化块设备的使用,Linux 内核提供了一个称为块 I/O(或块层)子系统的整个子系统:

  • 从内核的角度来看,寻址的最小逻辑单元是块(尽管可以在扇区级别对物理设备进行寻址,但内核使用块执行所有磁盘操作)
  • 由于物理寻址的最小单位是扇区,因此块的大小必须是扇区大小的倍数(块的大小因所使用的文件系统而异,最常见的值是 512B、1KB 和 4KB)

Register a block I/O device

从 Linux 内核的 4.9 版开始,register_blkdev 调用是可选的,此函数执行的唯一操作是动态分配主要参数并在 /proc/devices 中创建条目(在将来的内核版本中,它可能会被删除,但是,大多数驱动程序仍然调用它)

通常,对注册函数的调用在模块初始化函数中执行,对取消注册函数的调用在模块退出函数中执行,典型方案如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <linux/fs.h>

#define MY_BLOCK_MAJOR 240
#define MY_BLKDEV_NAME "mybdev"

/*
 * Module init: reserve the block major number.
 *
 * Returns 0 on success, or the negative error reported by
 * register_blkdev() (previously every failure was reported as -EBUSY
 * and the success path had no return statement).
 */
static int my_block_init(void)
{
	int status;

	status = register_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
	if (status < 0) {
		printk(KERN_ERR "unable to register mybdev block device\n");
		return status;
	}
	//...

	return 0;
}

/* Module exit: release the major number registered in my_block_init(). */
static void my_block_exit(void)
{
//...
unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
}

Register a disk

虽然函数 register_blkdev 注册了一个 major,但它不向系统提供设备(TYPE - disk),为了创建和使用块设备,使用 linux/genhd.h 中定义的专用接口:

1
2
3
#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) /* allocate a disk (struct gendisk) */
void del_gendisk(struct gendisk *gp); /* remove the given disk from the system */
void add_disk(struct gendisk *disk); /* add the disk to the system */

使用案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include <linux/fs.h>
#include <linux/genhd.h>

#define MY_BLOCK_MINORS 1

/* Per-device state of the example driver; only the gendisk pointer is shown. */
static struct my_block_dev {
struct gendisk *gd;
//...
} dev;

static int create_block_device(struct my_block_dev *dev)
{
dev->gd = alloc_disk(MY_BLOCK_MINORS);
//...
add_disk(dev->gd);
}

/* Remove the disk of @dev from the system, if one was allocated. */
static void delete_block_device(struct my_block_dev *dev)
{
if (dev->gd)
del_gendisk(dev->gd);
//...
}

/*
 * Module init: create the block device.
 * Returns 0 on success or the error from create_block_device(), which
 * the original sketch silently discarded.
 */
static int my_block_init(void)
{
	int status;

	//...
	status = create_block_device(&dev);
	if (status < 0)
		return status;

	return 0;
}

/* Module exit: tear down the block device created in my_block_init(). */
static void my_block_exit(void)
{
delete_block_device(&dev);
//...
}
  • 在调用函数 add_disk 后(实际上在调用期间),磁盘立即处于活动状态,并且可以随时调用其方法
  • 因此,在驱动程序完全初始化并准备好响应对已注册磁盘的请求之前,不应调用此函数

结构体 gendisk 存储有关磁盘的信息,这样的结构是从调用 alloc_disk 中获得的,在将其发送到函数 add_disk 之前必须填充其字段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* Kernel description of one disk; obtained from alloc_disk() and filled in before add_disk(). */
struct gendisk {
int major; /* major number of driver */
int first_minor;
int minors;
char disk_name[DISK_NAME_LEN]; /* disk name as shown in sysfs and /proc/partitions */
unsigned short events; /* supported events */
unsigned short event_flags; /* flags related to event processing */
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0;
const struct block_device_operations *fops; /* operations associated with the disk */
struct request_queue *queue; /* the request queue */
void *private_data; /* pointer to driver-private data */
int flags;
unsigned long state;
#define GD_NEED_PART_SCAN 0
struct rw_semaphore lookup_sem;
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#if IS_ENABLED(CONFIG_CDROM)
struct cdrom_device_info *cdi;
#endif
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
};

填充结构体 gendisk 的示例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <linux/genhd.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#define NR_SECTORS 1024

#define KERNEL_SECTOR_SIZE 512

/* Per-device state of the example driver: lock, request queue and gendisk. */
static struct my_block_dev {
//...
spinlock_t lock; /* For mutual exclusion */
struct request_queue *queue; /* The device request queue */
struct gendisk *gd; /* The gendisk structure */
//...
} dev;

static int create_block_device(struct my_block_dev *dev)
{
...
/* Initialize the gendisk structure */
dev->gd = alloc_disk(MY_BLOCK_MINORS);
if (!dev->gd) {
printk (KERN_NOTICE "alloc_disk failure\n");
return -ENOMEM;
}

dev->gd->major = MY_BLOCK_MAJOR;
dev->gd->first_minor = 0;
dev->gd->fops = &my_block_ops;
dev->gd->queue = dev->queue;
dev->gd->private_data = dev;
snprintf (dev->gd->disk_name, 32, "myblock");
set_capacity(dev->gd, NR_SECTORS);

add_disk(dev->gd);

return 0;
}

/*
 * Module init: create the block device.
 * Returns 0 on success or the error from create_block_device().
 */
static int my_block_init(void)
{
	int status;

	//...
	status = create_block_device(&dev);
	if (status < 0)
		return status;
	//...

	/* The success path must return a value too. */
	return 0;
}

/* Remove the disk of @dev from the system, if one was allocated. */
static void delete_block_device(struct my_block_dev *dev)
{
if (dev->gd) {
del_gendisk(dev->gd);
}
//...
}

/* Module exit: tear down the block device created in my_block_init(). */
static void my_block_exit(void)
{
delete_block_device(&dev);
//...
}

Request Queues Multi-Queue Block Layer

块设备的驱动程序使用请求队列来存储将要处理的块 I/O 请求:

  • 请求队列由结构表示 blk_mq_hw_ctx
  • 请求队列由请求及其关联控制信息的双链表组成
  • 请求通过更高级别的内核代码(例如,文件系统)添加到队列中

块设备驱动程序可以在前一个请求完成之前接受请求,因此,上层需要一种方法来知道请求何时完成,为此,在提交时向每个请求添加一个“标记”(用结构体 blk_mq_tag_set 来描述),并在请求完成后使用完成通知发回

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/* Tag set shared by one or more blk-mq request queues. */
struct blk_mq_tag_set {
struct blk_mq_queue_map map[HCTX_MAX_TYPES];
unsigned int nr_maps;
const struct blk_mq_ops *ops; /* queue operations implemented by the driver */
unsigned int nr_hw_queues; /* number of hardware queues allocated for the device */
unsigned int queue_depth; /* size of each hardware queue */
unsigned int reserved_tags;
unsigned int cmd_size; /* extra bytes allocated per request for the block driver's own use */
int numa_node; /* on NUMA systems, index of the node the storage device is attached to */
unsigned int timeout;
unsigned int flags;
void *driver_data; /* driver-private data */
atomic_t active_queues_shared_sbitmap;

struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct blk_mq_tags **tags; /* pointer to the array of tag sets */

struct mutex tag_list_lock;
struct list_head tag_list; /* list of request queues using this tag set */
};

相关 API 如下:

1
2
3
4
5
6
7
8
9
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); /* create a request queue */
void blk_cleanup_queue(struct request_queue *); /* clean up a request queue */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); /* after the tag set fields are initialised, allocate tags and requests for one or more request queues */
void blk_mq_free_tag_set(struct blk_mq_tag_set *set); /* destroy and free the tag set */

void blk_mq_start_request(struct request *rq); /* called before request processing starts; notifies the upper layers */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); /* put the request back on the queue */
void blk_mq_end_request(struct request *rq, blk_status_t error); /* finish request processing and notify the upper layers */
bool blk_rq_is_passthrough(struct request *rq); /* check the request type */

使用案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>

/* Per-device state of the example driver: blk-mq tag set and request queue. */
static struct my_block_dev {
//...
struct blk_mq_tag_set tag_set;
struct request_queue *queue;
//...
} dev;

static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
struct my_block_dev *dev = q->queuedata;
blk_mq_start_request(rq);
if (blk_rq_is_passthrough(rq)) {
printk (KERN_NOTICE "Skip non-fs request\n");
blk_mq_end_request(rq, BLK_STS_IOERR);
goto out;
}
/* do work */
...
blk_mq_end_request(rq, BLK_STS_OK);
out:
return BLK_STS_OK;
}

/* Queue operations handed to the tag set; only queue_rq is implemented. */
static struct blk_mq_ops my_queue_ops = {
.queue_rq = my_block_request,
};

/*
 * Set up the blk-mq tag set and request queue of @dev.
 *
 * Returns 0 on success or a negative error code. The original sketch
 * used an undeclared `err` and had no success return, so even a
 * successful setup fell through into the cleanup labels and freed the
 * tag set again.
 */
static int create_block_device(struct my_block_dev *dev)
{
	int err;

	/* Initialize tag set. */
	dev->tag_set.ops = &my_queue_ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.cmd_size = 0;
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	err = blk_mq_alloc_tag_set(&dev->tag_set);
	if (err)
		goto out_err;

	/* Allocate queue. */
	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		err = PTR_ERR(dev->queue);
		goto out_blk_init;
	}

	blk_queue_logical_block_size(dev->queue, KERNEL_SECTOR_SIZE);

	/* Assign private data to queue structure. */
	dev->queue->queuedata = dev;
	//...

	return 0;

out_blk_init:
	blk_mq_free_tag_set(&dev->tag_set);
out_err:
	return err;
}

/*
 * Module init: create the block device.
 * Returns 0 on success or the error from create_block_device().
 */
static int my_block_init(void)
{
	int status;

	//...
	status = create_block_device(&dev);
	if (status < 0)
		return status;
	//...

	/* The success path must return a value too. */
	return 0;
}

static void delete_block_device(struct block_dev *dev)
{
//...
blk_mq_free_tag_set(&dev->tag_set);
blk_cleanup_queue(dev->queue);
}

/* Module exit: tear down the block device created in my_block_init(). */
static void my_block_exit(void)
{
delete_block_device(&dev);
//...
}

Structure struct bio

Linux Block 层作为 IO 子系统的中间层,为上层提供接口,为下层提供数据;bio 是整个 block 层中最小的、不可分割的 I/O 单位

结构体 bio 用于描述一次块 I/O 操作(包括目标磁盘、起始扇区以及承载数据的内存页):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* Kernel descriptor of one block I/O operation. */
struct bio {
struct bio *bi_next; /* request queue link */
struct gendisk *bi_disk; /* the disk this bio is addressed to */
unsigned int bi_opf; /* operation and flag bits */

unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
unsigned short bi_write_hint;
blk_status_t bi_status;
u8 bi_partno;
atomic_t __bi_remaining;

struct bvec_iter bi_iter; /* iterator used to walk the bio_vec array, i.e. the bio's data area */

bio_end_io_t *bi_end_io;

void *bi_private;
#ifdef CONFIG_BLK_CGROUP
struct blkcg_gq *bi_blkg;
struct bio_issue bi_issue;
#ifdef CONFIG_BLK_CGROUP_IOCOST
u64 bi_iocost_cost;
#endif
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct bio_crypt_ctx *bi_crypt_context;
#endif

union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
};

unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */

atomic_t __bi_cnt; /* pin count */

struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
struct bio_vec bi_inline_vecs[];
};

相关 API 如下:

1
struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); /* allocate a bio with room for nr_iovecs bio_vec entries */
  • 调用 bio_alloc 以后,往往需要马上填充 bio 中的条目(尤其是:bi_disk、bi_iter、bi_opf),案例如下:
1
2
3
4
5
6
7
8
/* Build a read bio: allocate it, describe the target, then attach one page. */
struct bio *bio = bio_alloc(GFP_NOIO, 1);
//...
bio->bi_disk = bdev->bd_disk; /* target disk */
bio->bi_iter.bi_sector = sector; /* starting sector */
bio->bi_opf = REQ_OP_READ; /* operation type */
page = alloc_page(GFP_NOIO);
bio_add_page(bio, page, size, offset); /* attach the page as the data buffer */
//...

如果想要对 bio 进行操作(增删改查),必须将该结构的支持页面映射到对应的内核地址空间,操作完毕后再把映射解除

  • 对于 mapping/unmapping 映射,请使用 kmap_atomic 和 kunmap_atomic
1
2
#define kmap_atomic(page) /* map the physical page `page` into kernel address space */
#define kunmap_atomic(addr) /* undo the mapping of virtual address `addr` */

遍历 bio 并输出其关联内容的模板如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
 * Driver-specific data mover used by the iteration templates below,
 * which pass it a starting sector, a length in bytes, a mapped buffer
 * and a direction.
 */
static void my_block_transfer(struct my_block_dev *dev, size_t start,
size_t len, char *buffer, int dir);


/*
 * Walk every segment of @bio and hand its data to my_block_transfer().
 *
 * Each segment's page is mapped with kmap_atomic() for the duration of
 * the transfer and unmapped right after. Always returns 0.
 */
static int my_xfer_bio(struct my_block_dev *dev, struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	/* The direction is a property of the bio, not of a segment. */
	const int dir = bio_data_dir(bio);

	/* Do each segment independently. */
	bio_for_each_segment(bvec, bio, iter) {
		sector_t sector = iter.bi_sector;
		char *buffer = kmap_atomic(bvec.bv_page);
		unsigned long offset = bvec.bv_offset;
		size_t len = bvec.bv_len;

		/* len is a size_t, so it must be printed with %zu. */
		printk(KERN_LOG_LEVEL "%s: buf %8p offset %lu len %zu dir %d\n",
		       __func__, buffer, offset, len, dir);

		/* process mapped buffer */
		my_block_transfer(dev, sector, len, buffer + offset, dir);

		kunmap_atomic(buffer);
	}

	return 0;
}

/*
 * Walk every segment of every bio in @req and hand its data to
 * my_block_transfer().
 *
 * Each segment's page is mapped with kmap_atomic() for the duration of
 * the transfer and unmapped right after. Always returns 0.
 */
static int my_xfer_request(struct my_block_dev *dev, struct request *req)
{
	struct bio_vec bvec;
	struct req_iterator iter;
	/* No `bio` variable exists in this function (the original sketch
	 * referenced one); the direction comes from the request itself.
	 */
	const int dir = rq_data_dir(req);

	/* Do each segment independently. */
	rq_for_each_segment(bvec, req, iter) {
		sector_t sector = iter.iter.bi_sector;
		char *buffer = kmap_atomic(bvec.bv_page);
		unsigned long offset = bvec.bv_offset;
		size_t len = bvec.bv_len;

		/* len is a size_t, so it must be printed with %zu. */
		printk(KERN_LOG_LEVEL "%s: buf %8p offset %lu len %zu dir %d\n",
		       __func__, buffer, offset, len, dir);

		/* process mapped buffer */
		my_block_transfer(dev, sector, len, buffer + offset, dir);

		kunmap_atomic(buffer);
	}

	return 0;
}
  • 这两个模板比较固定,可以直接拿出来用

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=block_device_drivers make skels
make build

Test1 完整代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/*
* SO2 - Block device drivers lab (#7)
* Linux - Exercise #1, #2, #3, #6 (RAM Disk)
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>

#include <linux/genhd.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/blk_types.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/bio.h>
#include <linux/vmalloc.h>

MODULE_DESCRIPTION("Simple RAM Disk");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");


#define KERN_LOG_LEVEL KERN_ALERT

#define MY_BLOCK_MAJOR 240
#define MY_BLKDEV_NAME "mybdev"
#define MY_BLOCK_MINORS 1
#define NR_SECTORS 128

#define KERNEL_SECTOR_SIZE 512

/* TODO 6: use bios for read/write requests */
#define USE_BIO_TRANSFER 0


/* State of the single RAM disk instance (g_dev). */
static struct my_block_dev {
/* blk-mq tag set; filled in and allocated by create_block_device() */
struct blk_mq_tag_set tag_set;
/* request queue created from tag_set */
struct request_queue *queue;
/* gendisk registered with add_disk() */
struct gendisk *gd;
/* vmalloc'ed backing store holding the disk contents */
u8 *data;
/* size of data in bytes (NR_SECTORS * KERNEL_SECTOR_SIZE) */
size_t size;
} g_dev;

/* block_device_operations.open callback: nothing to set up, always returns 0. */
static int my_block_open(struct block_device *bdev, fmode_t mode)
{
return 0;
}

/* block_device_operations.release callback: intentionally empty. */
static void my_block_release(struct gendisk *gd, fmode_t mode)
{
}

/* Operations table installed as gd->fops in create_block_device(). */
static const struct block_device_operations my_block_ops = {
.owner = THIS_MODULE,
.open = my_block_open,
.release = my_block_release
};

/*
 * Copy len bytes between a caller buffer and the RAM disk backing store.
 *
 * @dev:    device whose data area is accessed
 * @sector: first sector of the transfer (KERNEL_SECTOR_SIZE bytes each)
 * @len:    number of bytes to copy
 * @buffer: caller buffer
 * @dir:    1 copies buffer -> device, anything else copies device -> buffer
 *
 * A transfer that would run past the end of the device is silently dropped.
 */
static void my_block_transfer(struct my_block_dev *dev, sector_t sector,
		unsigned long len, char *buffer, int dir)
{
	const unsigned long start = sector * KERNEL_SECTOR_SIZE;
	const unsigned long end = start + len;
	u8 *ram;

	/* refuse to read or write beyond the end of the block device */
	if (end > dev->size)
		return;

	ram = dev->data + start;
	if (dir == 1)
		memcpy(ram, buffer, len);
	else
		memcpy(buffer, ram, len);
}

/* to transfer data using bio structures enable USE_BIO_TRANSFER */
#if USE_BIO_TRANSFER == 1
/*
 * Walk every segment of a request and copy it to or from the RAM disk.
 *
 * @dev: device whose buffer is read or written
 * @req: block request to process
 *
 * Each segment's page is mapped with kmap_atomic() for the duration of the
 * copy performed by my_block_transfer().
 */
static void my_xfer_request(struct my_block_dev *dev, struct request *req)
{
	struct bio_vec bvec;
	struct req_iterator iter;

	rq_for_each_segment(bvec, req, iter) {
		sector_t sector = iter.iter.bi_sector;
		unsigned long offset = bvec.bv_offset;
		size_t len = bvec.bv_len;
		int dir = bio_data_dir(iter.bio);
		char *buffer = kmap_atomic(bvec.bv_page);

		/* len is a size_t, so it has to be printed with %zu, not %u */
		printk(KERN_LOG_LEVEL "%s: buf %8p offset %lu len %zu dir %d\n",
		       __func__, buffer, offset, len, dir);

		my_block_transfer(dev, sector, len, buffer + offset, dir);
		kunmap_atomic(buffer);
	}
}
#endif

/*
 * blk-mq .queue_rq handler for the RAM disk.
 *
 * @hctx: hardware queue context; its queue's queuedata holds the device
 * @bd:   queue data carrying the request to serve
 *
 * Passthrough (non-filesystem) requests are completed with BLK_STS_IOERR.
 * Every other request is logged, served either segment by segment
 * (USE_BIO_TRANSFER == 1) or in one copy from the first bio's data, and then
 * completed with BLK_STS_OK. The function itself always returns BLK_STS_OK.
 */
static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct my_block_dev *dev = hctx->queue->queuedata;

	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		printk (KERN_NOTICE "Skip non-fs request\n");
		blk_mq_end_request(rq, BLK_STS_IOERR);
		goto out;
	}

	printk(KERN_LOG_LEVEL
		"request received: pos=%llu bytes=%u "
		"cur_bytes=%u dir=%c\n",
		(unsigned long long) blk_rq_pos(rq),
		blk_rq_bytes(rq), blk_rq_cur_bytes(rq),
		rq_data_dir(rq) ? 'W' : 'R');

#if USE_BIO_TRANSFER == 1
	/* The statement terminator was missing here, which broke this build variant. */
	my_xfer_request(dev, rq);
#else
	my_block_transfer(dev, blk_rq_pos(rq), blk_rq_bytes(rq),
			  bio_data(rq->bio), rq_data_dir(rq));
#endif

	blk_mq_end_request(rq, BLK_STS_OK);
out:
	return BLK_STS_OK;
}

/* blk-mq operations; installed as tag_set.ops in create_block_device(). */
static struct blk_mq_ops my_queue_ops = {
.queue_rq = my_block_request,
};

/*
 * Allocate and register everything the RAM disk needs: the backing buffer,
 * the blk-mq tag set, the request queue and the gendisk.
 *
 * @dev: device structure to fill in
 *
 * Returns 0 on success or a negative errno. On failure the resources
 * acquired so far are released in reverse order through the goto labels.
 */
static int create_block_device(struct my_block_dev *dev)
{
int err;

/* Backing store: NR_SECTORS sectors of KERNEL_SECTOR_SIZE bytes. */
dev->size = NR_SECTORS * KERNEL_SECTOR_SIZE;
dev->data = vmalloc(dev->size);
if (dev->data == NULL) {
printk(KERN_ERR "vmalloc: out of memory\n");
err = -ENOMEM;
goto out_vmalloc;
}

/* Initialize tag set. */
dev->tag_set.ops = &my_queue_ops;
dev->tag_set.nr_hw_queues = 1;
dev->tag_set.queue_depth = 128;
dev->tag_set.numa_node = NUMA_NO_NODE;
dev->tag_set.cmd_size = 0;
dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
err = blk_mq_alloc_tag_set(&dev->tag_set);
if (err) {
printk(KERN_ERR "blk_mq_alloc_tag_set: can't allocate tag set\n");
goto out_alloc_tag_set;
}

/* Allocate queue. */
dev->queue = blk_mq_init_queue(&dev->tag_set);
if (IS_ERR(dev->queue)) {
printk(KERN_ERR "blk_mq_init_queue: out of memory\n");
err = -ENOMEM;
goto out_blk_init;
}
blk_queue_logical_block_size(dev->queue, KERNEL_SECTOR_SIZE);
/* my_block_request() reads the device back from queuedata */
dev->queue->queuedata = dev;

/* initialize the gendisk structure */
dev->gd = alloc_disk(MY_BLOCK_MINORS);
if (!dev->gd) {
printk(KERN_ERR "alloc_disk: failure\n");
err = -ENOMEM;
goto out_alloc_disk;
}

dev->gd->major = MY_BLOCK_MAJOR;
dev->gd->first_minor = 0;
dev->gd->fops = &my_block_ops;
dev->gd->queue = dev->queue;
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, DISK_NAME_LEN, "myblock");
set_capacity(dev->gd, NR_SECTORS);

add_disk(dev->gd);

return 0;

/* Error unwinding: each label releases what was set up before the failing step. */
out_alloc_disk:
blk_cleanup_queue(dev->queue);
out_blk_init:
blk_mq_free_tag_set(&dev->tag_set);
out_alloc_tag_set:
vfree(dev->data);
out_vmalloc:
return err;
}

/*
 * Module entry point: register the block major, then create the RAM disk.
 *
 * Returns 0 on success. Returns -EBUSY when the major cannot be registered,
 * or the error reported by create_block_device(); in the latter case the
 * major is unregistered again so a failed load leaves nothing behind.
 */
static int __init my_block_init(void)
{
	int err;

	err = register_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
	if (err < 0) {
		printk(KERN_ERR "unable to register mybdev block device\n");
		return -EBUSY;
	}

	/*
	 * The result used to be ignored and 0 returned unconditionally, which
	 * left a half-initialised device loaded and the cleanup path below
	 * unreachable.
	 */
	err = create_block_device(&g_dev);
	if (err < 0)
		goto out;

	return 0;

out:
	unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
	return err;
}

static void delete_block_device(struct my_block_dev *dev)
{
if (dev->gd) {
del_gendisk(dev->gd);
put_disk(dev->gd);
}

if (dev->queue)
blk_cleanup_queue(dev->queue);
if (dev->tag_set.tags)
blk_mq_free_tag_set(&dev->tag_set);
if (dev->data)
vfree(dev->data);
}

/* Module exit: tear down the RAM disk first, then give the block major back. */
static void __exit my_block_exit(void)
{
/* TODO 2: cleanup block device using delete_block_device */
delete_block_device(&g_dev);
/* TODO 1: unregister block device */
unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
}

module_init(my_block_init);
module_exit(my_block_exit);
  • 在提交块 IO 请求时,需要附带3个关键结构体:
    • blk_mq_tag_set 类型的“标记”
    • request_queue 类型的请求队列
    • gendisk 类型的磁盘相关信息
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
root@qemux86:~/skels/block_device_drivers/1-2-3-6-ram-disk/user# ./ram-disk-test

insmod ../kernel/ram-disk.ko
mknod /dev/myblock b 240 0
mknod: /dev/myblock: File exists
request received: pos=0 bytes=4096 cur_bytes=4096 dir=R
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 0 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 1 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 2 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 3 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 4 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 5 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W
test sector 6 ... passed
request received: pos=0 bytes=4096 cur_bytes=4096 dir=W

Test2 完整代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
* SO2 Lab - Block device drivers (#7)
* Linux - Exercise #4, #5 (Relay disk - bio)
*/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>

MODULE_AUTHOR("SO2");
MODULE_DESCRIPTION("Relay disk");
MODULE_LICENSE("GPL");

#define KERN_LOG_LEVEL KERN_ALERT

#define PHYSICAL_DISK_NAME "/dev/vdb"
#define KERNEL_SECTOR_SIZE 512

#define BIO_WRITE_MESSAGE "def"

/* pointer to physical device structure */
static struct block_device *phys_bdev;

static void send_test_bio(struct block_device *bdev, int dir)
{
struct bio *bio = bio_alloc(GFP_NOIO, 1);
struct page *page;
char *buf;

/* TODO 4: fill bio (bdev, sector, direction) */
bio->bi_disk = bdev->bd_disk;
bio->bi_iter.bi_sector = 0;
bio->bi_opf = dir;
page = alloc_page(GFP_NOIO);
bio_add_page(bio, page, KERNEL_SECTOR_SIZE, 0);

/* TODO 5: write message to bio buffer if direction is write */
if (dir == REQ_OP_WRITE) {
buf = kmap_atomic(page);
memcpy(buf, BIO_WRITE_MESSAGE, strlen(BIO_WRITE_MESSAGE));
kunmap_atomic(buf);
}
/* TODO 4: submit bio and wait for completion */
printk(KERN_LOG_LEVEL "[send_test_bio] Submiting bio\n");
submit_bio_wait(bio);
printk(KERN_LOG_LEVEL "[send_test_bio] Done bio\n");
/* TODO 4: read data (first 3 bytes) from bio buffer and print it */
buf = kmap_atomic(page);
printk(KERN_LOG_LEVEL "read %02x %02x %02x\n", buf[0], buf[1], buf[2]);
kunmap_atomic(buf);

bio_put(bio);
__free_page(page);
}

static struct block_device *open_disk(char *name)
{
struct block_device *bdev;

/* TODO 4: get block device in exclusive mode */
bdev = blkdev_get_by_path(name,FMODE_READ | FMODE_WRITE | FMODE_EXCL,THIS_MODULE);
return bdev;
}

/*
 * Module entry point: open the physical disk and send one test read bio.
 *
 * Returns 0 on success, or a negative errno when the disk cannot be opened.
 */
static int __init relay_init(void)
{
	phys_bdev = open_disk(PHYSICAL_DISK_NAME);
	/*
	 * blkdev_get_by_path() signals failure with an ERR_PTR-encoded
	 * pointer, so a plain NULL test lets the error value through and it is
	 * dereferenced later. NULL is still rejected for safety.
	 */
	if (IS_ERR_OR_NULL(phys_bdev)) {
		int err = phys_bdev ? PTR_ERR(phys_bdev) : -EINVAL;

		printk(KERN_ERR "[relay_init] No such device\n");
		phys_bdev = NULL;
		return err;
	}

	send_test_bio(phys_bdev, REQ_OP_READ);
	return 0;
}

/* Release bdev; the mode flags are the same ones open_disk() passes. */
static void close_disk(struct block_device *bdev)
{
/* TODO 4: put block device */
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}

/* Module exit: send one test write bio to the physical disk, then release it. */
static void __exit relay_exit(void)
{
/* TODO 5: send test write bio */
send_test_bio(phys_bdev, REQ_OP_WRITE);
close_disk(phys_bdev);
}

module_init(relay_init);
module_exit(relay_exit);
  • PS:对结构体 bio 的操作必须包裹在 kmap_atomic 和 kunmap_atomic 之间
  • 结果:
1
2
3
4
5
6
7
8
root@qemux86:~/skels/block_device_drivers/4-5-relay-disk# insmod relay-disk.ko 
[send_test_bio] Submiting bio
[send_test_bio] Done bio
read 64 65 66
root@qemux86:~/skels/block_device_drivers/4-5-relay-disk# rmmod relay-disk.ko
[send_test_bio] Submiting bio
[send_test_bio] Done bio
read 64 65 66

Deferred work

实验目标:

  • 了解延迟的工作(即计划在以后执行的代码)
  • 使用延迟工作实现常见任务
  • 了解延迟工作的同步特性

延迟工作是一类内核工具,它允许人们计划代码在以后的计时器上执行,此计划代码可以在进程上下文中运行,也可以在中断上下文中运行,具体取决于延迟工作的类型

延迟工作用于补充中断处理程序功能,因为中断具有重要的要求和限制:

  • 中断处理程序的执行时间必须尽可能短
  • 在中断上下文中,我们不能使用阻塞调用

使用延迟工作 Deferred work,我们可以在中断处理程序中执行所需的最少工作,并安排中断处理程序的异步操作在以后运行并执行其余操作

  • 在中断上下文中运行的延迟工作也称为下半部分,因为它的目的是从中断处理程序(上半部分)执行其余操作

有三种典型操作可用于所有类型的延迟工作:

  • Initialization 初始化:每种类型都由一个结构描述,该结构的字段必须初始化,此时还设置了要计划的处理程序
  • Scheduling 调度:计划处理程序的执行尽快(或在超时到期后)
  • Masking or Canceling 掩蔽/取消:禁用处理程序的执行,此操作可以是同步的(这保证了处理程序在取消完成后不会运行)或异步的

延迟工作 Deferred work 的主要类型是内核线程和软中断(softirqs):

  • 工作队列在内核线程之上实现,Tasklets 和计时器在软中断(softirqs)之上实现
  • 下半部分处理程序是 Linux 中延迟工作的第一个实现(但与此同时,它被软中断 softirqs 所取代,基于 softirqs 又诞生了可以动态分配的 Tasklets)
  • 这就是为什么所呈现的某些函数在其名称中包含 bh(下半部分 bottom half)的原因

Softirqs

软中断 Softirqs,软中断是在编译期间静态分配的,因此不能由设备驱动使用,它们是为各种内核子系统保留的

因此,在编译时定义了固定数量的 softirq,对于当前的内核版本,我们定义了以下类型:

1
2
3
4
5
6
7
8
9
10
11
12
13
enum {
HI_SOFTIRQ = 0, /* runs tasklets */
TIMER_SOFTIRQ, /* runs timers */
NET_TX_SOFTIRQ, /* used by the networking subsystem */
NET_RX_SOFTIRQ, /* used by the networking subsystem */
BLOCK_SOFTIRQ, /* used by the I/O subsystem */
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ, /* runs tasklets */
SCHED_SOFTIRQ, /* load balancing */
HRTIMER_SOFTIRQ, /* implements high-resolution timers */
RCU_SOFTIRQ, /* implements RCU (read-copy-update) type mechanisms */
NR_SOFTIRQS
};
  • 不同类型的 Softirqs 有不同的作用(部分类型会在后面讲到)

Softirqs 在中断上下文中运行,这意味着它们不能调用阻塞函数,如果 softirq 处理程序需要调用此类函数,则可以安排工作队列 work queues 来执行这些阻塞调用

Tasklets

任务集 Tasklets 是一种特殊形式的延迟工作,在中断上下文中运行,类似于 softirqs,两者主要区别在于:

  • Tasklets 可以动态分配,因此它们可以由设备驱动程序使用
  • Tasklets 由 struct tasklet_struct 表示,与许多其他内核结构一样,在使用之前需要对其进行初始化
1
2
3
4
void handler(unsigned long data);

DECLARE_TASKLET(tasklet, handler, data);
DECLARE_TASKLET_DISABLED(tasklet, handler, data);

如果我们想手动初始化 Tasklets,我们可以使用以下方法:

1
2
3
4
void handler(unsigned long data);

struct tasklet_struct tasklet;
tasklet_init(&tasklet, handler, data);

针对运行的 Tasklets 编程称为调度 scheduling,Tasklets 调度通过以下方式完成:

1
2
void tasklet_schedule(struct tasklet_struct *tasklet);
void tasklet_hi_schedule(struct tasklet_struct *tasklet);

可以通过以下的函数屏蔽 Tasklets:

1
2
void tasklet_enable(struct tasklet_struct * tasklet);
void tasklet_disable(struct tasklet_struct * tasklet);
  • PS:由于 Tasklets 是从 softirqs 运行的,因此无法在处理程序函数中使用阻塞调用

Timers

计时器 Timers 一种特殊类型的延迟工作:

  • 由结构定义 timer_list
  • 在中断上下文中运行,并在软中断(softirqs)之上实现

由于 Timers 必须是原子的,所以 Timers 运行在原子上下文中,内核不能访问用户空间,而且内核是不能休眠或调度(其实 Timers 也是用 Softirqs 实现的,当然不能休眠或调度)

注册好的 Timers 是由内核子模块组织并存储的,其底层的核心功能“定时”是由内核实现,Timers 需要提供一个回调函数,该回调函数和定时器注册程序使用的是同一个线程(这也是在删除模块之前,必须停止计时器的原因)

要使用 Timers,必须首先调用如下函数:

1
2
3
4
5
#include <linux/sched.h>

void timer_setup(struct timer_list * timer,
void (*function)(struct timer_list *),
unsigned int flags);
  • 用于初始化 timer_list 的内部字段,并将函数关联为计时器处理程序
1
2
3
4
5
6
7
8
9
10
11
12
13
14
struct timer_list {
/*
* All fields that change during normal runtime grouped to the
* same cacheline
*/
struct hlist_node entry;
unsigned long expires;
void (*function)(struct timer_list *);
u32 flags;

#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
  • expires:运行处理程序函数 (*function)(struct timer_list *) 的时间
  • function:回调函数

使用计时器的一个常见错误是忘记关闭计时器:

  • 在删除模块之前,我们必须停止计时器
1
2
del_timer_sync(struct timer_list * timer); /* 摘除一个定时器对象,确保当函数返回时系统中没有任何处理器正在执行定时器对象上的定时器函数 */
del_timer(struct timer_list * timer); /* 从系统的定时器管理队列中摘除一个定时器对象 */
  • 如果计时器在模块被删除后过期,则处理程序函数将不再加载到内核中,并且将生成内核 oops

定时器的使用案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#include <linux/sched.h>

static struct my_device_data {
struct timer_list timer;
......
} dev;


void timer_function(struct timer_list * tl){
struct my_device_data *my_data = from_timer(my_data, tl, timer);
.....
mod_timer(&my_data->timer, jiffies + seconds * HZ);
}

static void my_access(void){
unsigned long seconds = 1;

timer_setup(&dev.timer, timer_function, 0);
mod_timer(&dev.timer, jiffies + seconds * HZ);
}

其中的 from_timer 用于从 timer_list 中获取父结构体 my_device_data

1
2
#define from_timer(var, callback_timer, timer_fieldname) \
container_of(callback_timer, typeof(*var), timer_fieldname)

Locking

为了在进程上下文(简称 “A”)中运行的代码和在 softirq 上下文(简称 “B”)中运行的代码之间保持同步,我们需要使用特殊的锁定基元:

  • 必须在 A 中使用 “停用当前处理器上的下半部分处理程序” 的自旋锁操作
  • 并且在 B 中仅使用基本的自旋锁操作

使用自旋锁可以确保我们不会在多个 CPU 之间发生竞争条件(race),而停用 softirq 可以确保不会因为 softirq 被安排到已经持有该自旋锁的同一 CPU 上运行而出现死锁

1
2
3
4
5
6
7
8
9
10
11
12
static void my_access(void){
spin_lock_bh(&my_data->lock);
......
spin_unlock_bh(&my_data->lock);
}

static void timer_handler(struct timer_list *tl){
spin_lock(&my_data->lock);
......
spin_unlock(&my_data->lock);
mod_timer(&my_data->timer, jiffies + HZ);
}
  • 定时器的回调函数在任何时候都可能会发生(内核会直接抢占原来的进程,从而执行回调函数)
  • 如果在 my_access 中拿了自旋锁之后被 timer_handler 抢占,就会发生死锁(回调函数和定时器注册程序使用的是同一个线程)
  • 如果不在 timer_handler 中加锁,又可能会破坏共享数据(例如:在引用指针之前置空了指针)
  • 因此需要使用 spin_lock_bh 在加锁的同时禁止中断下半部(多指软中断)

我们可以使用如下函数来 禁用/启用 softirqs 处理程序:(由于计时器和 Tasklets 运行在 softirqs 之上,它们也会随之被禁用/启用)

1
2
3
4
void local_bh_disable(void); /* 禁用softirqs处理程序 */
void local_bh_enable(void); /* 启用softirqs处理程序 */
void spin_lock_bh(spinlock_t *lock); /* 禁用softirqs处理程序,添加自旋锁 */
void spin_unlock_bh(spinlock_t *lock); /* 启用softirqs处理程序,释放自旋锁 */

Workqueues

工作队列 Workqueues 用于计划在进程上下文中运行的操作,其实就是一个用于创建内核线程的接口,通过它可以创建一个“工作者线程”来专门处理中断的下半部工作

  • Workqueues 的优点就是可以使用独立于原进程的内核线程
  • 如果想要在中断处理程序中进行调度,就必须使用 Workqueues

它们工作的基本单位称为工作项 Work items,有两种类型的工作:

  • struct work_struct:它计划稍后运行的任务
  • struct delayed_work:它计划任务在至少给定的时间间隔后运行

在使用工作项之前,必须初始化工作项,可以使用两种类型的宏:

  • 一种是同时声明和初始化工作项 Work items
  • 另一种是仅初始化工作项 Work items
1
2
3
4
5
6
7
#include <linux/workqueue.h>

DECLARE_WORK(name , void (*function)(struct work_struct *));
DECLARE_DELAYED_WORK(name, void(*function)(struct work_struct *));

INIT_WORK(struct work_struct *work, void(*function)(struct work_struct *));
INIT_DELAYED_WORK(struct delayed_work *work, void(*function)(struct work_struct *));

一旦声明和初始化工作队列,我们可以使用以下函数来调度任务:

1
2
bool schedule_work(struct work_struct *work);
bool schedule_delayed_work(struct delayed_work *work, unsigned long delay);

可以用如下函数取消工作项:

1
2
int cancel_work_sync(struct delayed_work *work);
int cancel_delayed_work_sync(struct delayed_work *work);

可以通过以下方式等待工作队列完成运行其所有工作项:

1
void flush_scheduled_work(void);

当然以上这些函数都是针对某个工作项 Work items 而言

工作队列由结构 workqueue_struct 表示,可以使用以下这些函数创建新的工作队列:

1
2
struct workqueue_struct *create_workqueue(const char *name);
struct workqueue_struct *create_singlethread_workqueue(const char *name);

要在新队列中添加任务(某个工作项),使用如下函数:

1
2
3
4
int queue_work(struct workqueue_struct * queue, struct work_struct *work);

int queue_delayed_work(struct workqueue_struct *queue,
struct delayed_work * work , unsigned long delay);

等待所有工作项完成调用:

1
void flush_workqueue(struct workqueue_struct *queue);

销毁工作队列:

1
void destroy_workqueue(struct workqueue_struct *queue);

Kernel threads

内核线程是工作队列机制的基础,从本质上讲,内核线程是仅在内核模式下运行的线程,没有用户地址空间或其他用户属性

创建内核线程的函数:(但是不会直接运行)

1
2
3
4
#include <linux/kthread.h>

struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data, const char namefmt[], ...);
  • threadfn:将由内核线程运行的函数
  • data:要发送到函数的参数
  • namefmt:表示内核线程名称

启动内核线程:

1
int wake_up_process(struct task_struct *p);

创建和运行内核线程:

1
2
struct task_struct *kthread_run(int (*threadfn)(void *data),
				void *data, const char namefmt[], ...);

内核线程的终止是在内核线程中运行的函数中自愿完成的,通过调用如下函数:

1
fastcall NORET_TYPE void do_exit(long code);

内核线程处理程序的大多数实现都使用相同的模型,建议开始使用相同的模型以避免常见错误:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#include <linux/kthread.h>

DECLARE_WAIT_QUEUE_HEAD(wq); /* wait queue the kernel thread sleeps on */

/* list of events to be processed by the kernel thread (statically initialised) */
LIST_HEAD(events_list);
/* protects events_list; "struct spin_lock" is not a kernel type, spinlock_t is */
DEFINE_SPINLOCK(events_lock);

// structure describing the event to be processed
struct event {
struct list_head lh;
bool stop;
//...
};

/*
 * Detach and return the first pending event, or NULL when the list is empty.
 * The list is accessed under events_lock.
 */
struct event *get_next_event(void)
{
	struct event *e;

	spin_lock(&events_lock);
	/*
	 * list_first_entry() takes the entry type (not a pointer type) and
	 * never yields NULL; the _or_null variant handles the empty list.
	 */
	e = list_first_entry_or_null(&events_list, struct event, lh);
	if (e)
		list_del(&e->lh);
	spin_unlock(&events_lock);

	return e;
}

/*
 * Kernel thread body: sleep on wq until an event is available, process it,
 * and exit once an event with the stop flag set has been received.
 */
int my_thread_f(void *data)
{
	struct event *e;

	while (true) {
		/*
		 * get_next_event must actually be called: without the
		 * parentheses the function's address is assigned, which is
		 * always non-NULL and is not a struct event.
		 */
		wait_event(wq, (e = get_next_event()));

		/* Event processing */
		if (e->stop)
			break;
	}

	do_exit(0);
}

/* create and start the kthread */
kthread_run(my_thread_f, NULL, "%skthread%d", "my", 0);

使用上面的模板,内核线程请求可以通过以下方式发出:

1
2
3
4
5
6
7
/*
 * Queue ev for the kernel thread and wake it up.
 * The event is linked into events_list under events_lock; wake_up() runs
 * after the lock has been dropped.
 */
void send_event(struct event *ev)
{
spin_lock(&events_lock);
list_add(&ev->lh, &events_list);
spin_unlock(&events_lock);
wake_up(&wq);
}

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
LABS=deferred_work make skels
make build
make copy

1.Timer 完整代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
* Deferred Work
*
* Exercise #1, #2: simple timer
*/

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h>

MODULE_DESCRIPTION("Simple kernel timer");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define TIMER_TIMEOUT 1

static struct timer_list timer;

/*
 * Timer callback: print a running counter and re-arm the timer so that it
 * fires again TIMER_TIMEOUT seconds from now.
 */
static void timer_handler(struct timer_list *tl)
{
	/* number of expirations so far; static storage starts at zero */
	static int tick_count;

	pr_info("message :%d\n", tick_count);
	tick_count++;

	mod_timer(&timer, jiffies + TIMER_TIMEOUT * HZ);
}

/*
 * Module entry point: bind timer_handler to the timer and schedule the first
 * expiration TIMER_TIMEOUT seconds from now. Always returns 0.
 */
static int __init timer_init(void)
{
	unsigned long first_expiry;

	pr_info("[timer_init] Init module\n");

	timer_setup(&timer, timer_handler, 0);

	first_expiry = jiffies + TIMER_TIMEOUT * HZ;
	mod_timer(&timer, first_expiry);

	return 0;
}

/* Module exit: stop the timer before the module text goes away. */
static void __exit timer_exit(void)
{
pr_info("[timer_exit] Exit module\n");
/* TODO 1: cleanup; make sure the timer is not running after we exit */
del_timer_sync(&timer);
}

module_init(timer_init);
module_exit(timer_exit);
  • 结果:
1
2
3
4
5
6
7
8
root@qemux86:~/skels/deferred_work/1-2-timer# insmod timer.ko                   
timer: loading out-of-tree module taints kernel.
[timer_init] Init module
root@qemux86:~/skels/deferred_work/1-2-timer# message :0
message :1
message :2
message :3
message :4
  • 在定时器处理程序中还可以调用定时器,这样就可以实现定时循环

2.Deferred 完整代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/*
* SO2 - Lab 6 - Deferred Work
*
* Exercises #3, #4, #5: deferred work
*
* Code skeleton.
*/

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/task.h>
#include "../include/deferred.h"

#define MY_MAJOR 42
#define MY_MINOR 0
#define MODULE_NAME "deferred"

#define TIMER_TYPE_NONE -1
#define TIMER_TYPE_SET 0
#define TIMER_TYPE_ALLOC 1
#define TIMER_TYPE_MON 2

MODULE_DESCRIPTION("Deferred work character device");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* One monitored process, linked into my_device_data.list. */
struct mon_proc {
/* monitored task; get_proc() takes a reference on it */
struct task_struct *task;
/* node in the device's list of monitored processes */
struct list_head list;
};

/* State of the single deferred-work character device (dev). */
static struct my_device_data {
struct cdev cdev;
/* timer armed from the ioctl handler; its callback is timer_handler() */
struct timer_list timer;
/* one of the TIMER_TYPE_* values; selects what timer_handler() does */
unsigned int flag;
/* work item scheduled by the timer for TIMER_TYPE_ALLOC; runs work_handler() */
struct work_struct work;
/* list of struct mon_proc entries for monitored processes */
struct list_head list;
/* spinlock protecting list */
spinlock_t lock;
} dev;

/*
 * Simulate a blocking operation: sleep for up to 5 seconds (interruptibly),
 * then log a message. Because it sleeps, it must not be called from atomic
 * context.
 */
static void alloc_io(void)
{
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
pr_info("Yawn! I've been sleeping for 5 seconds.\n");
}

/*
 * Look up the task with the given pid and wrap it in a new mon_proc entry.
 *
 * @pid: pid to look up in the caller's pid namespace
 *
 * Returns the new entry, which holds a reference on the task (dropped with
 * put_task_struct() when the entry is destroyed), ERR_PTR(-ESRCH) when no
 * such task exists, or ERR_PTR(-ENOMEM) when the allocation fails.
 */
static struct mon_proc *get_proc(pid_t pid)
{
	struct task_struct *task;
	struct mon_proc *p;

	/*
	 * The pointer returned by pid_task() is only guaranteed to stay valid
	 * inside the RCU read-side section, so the reference must be taken
	 * before rcu_read_unlock(), not after it.
	 */
	rcu_read_lock();
	task = pid_task(find_vpid(pid), PIDTYPE_PID);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		return ERR_PTR(-ESRCH);

	p = kmalloc(sizeof(*p), GFP_ATOMIC);
	if (!p) {
		/* give back the reference taken above */
		put_task_struct(task);
		return ERR_PTR(-ENOMEM);
	}

	p->task = task;

	return p;
}


/* TODO 3: define work handler */
/* Work item callback: runs the blocking alloc_io() on behalf of the timer. */
static void work_handler(struct work_struct *work)
{
alloc_io();
}

#define ALLOC_IO_DIRECT
/* TODO 3: undef ALLOC_IO_DIRECT*/
#undef ALLOC_IO_DIRECT

/*
 * Timer callback for the deferred-work device.
 *
 * Logs the current task, then acts on the device flag:
 *  - TIMER_TYPE_ALLOC: schedule the work item that runs alloc_io()
 *    (alloc_io() sleeps, so it is not called directly from here);
 *  - TIMER_TYPE_MON: drop every monitored task whose state is TASK_DEAD and
 *    re-arm the timer for one second later;
 *  - any other value: nothing further.
 */
static void timer_handler(struct timer_list *tl)
{
	struct my_device_data *my_data = from_timer(my_data, tl, timer);
	const unsigned int mode = my_data->flag;
	struct mon_proc *entry, *tmp;

	pr_info("[timer_handler] pid = %d, comm = %s\n",
		current->pid, current->comm);

	if (mode == TIMER_TYPE_ALLOC) {
		schedule_work(&my_data->work);
		return;
	}

	if (mode != TIMER_TYPE_MON)
		return;

	spin_lock(&my_data->lock);
	list_for_each_entry_safe(entry, tmp, &my_data->list, list) {
		if (entry->task->state != TASK_DEAD)
			continue;

		pr_info("task %s (%d) is dead\n", entry->task->comm,
			entry->task->pid);
		/* drop the task reference, unlink the entry and free it */
		put_task_struct(entry->task);
		list_del(&entry->list);
		kfree(entry);
	}
	spin_unlock(&my_data->lock);

	mod_timer(&my_data->timer, jiffies + HZ);
}

/*
 * open() handler: stash the device structure that embeds the inode's cdev in
 * file->private_data for later calls. Always returns 0.
 */
static int deferred_open(struct inode *inode, struct file *file)
{
	struct cdev *cd = inode->i_cdev;

	file->private_data = container_of(cd, struct my_device_data, cdev);
	pr_info("[deferred_open] Device opened\n");

	return 0;
}

/* release() handler: only logs the event; always returns 0. */
static int deferred_release(struct inode *inode, struct file *file)
{
pr_info("[deferred_release] Device released\n");
return 0;
}

/*
 * ioctl handler for the deferred-work device.
 *
 * @file: open file; private_data holds the device structure
 * @cmd:  one of the MY_IOCTL_TIMER_* commands
 * @arg:  timer delay in seconds for the commands that arm the timer
 *
 * Returns 0 on success, -ENOTTY for an unknown command, or the error
 * reported by get_proc() for MY_IOCTL_TIMER_MON.
 */
static long deferred_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct my_device_data *my_data = file->private_data;
	struct mon_proc *p;

	pr_info("[deferred_ioctl] Command: %s\n", ioctl_command_to_string(cmd));

	switch (cmd) {
	case MY_IOCTL_TIMER_SET:
		my_data->flag = TIMER_TYPE_SET;
		mod_timer(&my_data->timer, jiffies + arg * HZ);
		break;
	case MY_IOCTL_TIMER_CANCEL:
		del_timer_sync(&my_data->timer);
		break;
	case MY_IOCTL_TIMER_ALLOC:
		my_data->flag = TIMER_TYPE_ALLOC;
		mod_timer(&my_data->timer, jiffies + arg * HZ);
		break;
	case MY_IOCTL_TIMER_MON:
		/*
		 * NOTE(review): the user-space test prints "Monitor PID <arg>",
		 * which suggests arg may be meant as the pid to monitor rather
		 * than a delay -- confirm; the calling process is monitored
		 * here, as before.
		 */
		p = get_proc(current->pid);
		/*
		 * get_proc() reports failure as an ERR_PTR value; linking that
		 * into the list would corrupt it.
		 */
		if (IS_ERR(p))
			return PTR_ERR(p);

		/* the timer handler walks the list, so keep it out while adding */
		spin_lock_bh(&my_data->lock);
		list_add(&p->list, &my_data->list);
		spin_unlock_bh(&my_data->lock);

		my_data->flag = TIMER_TYPE_MON;
		mod_timer(&my_data->timer, jiffies + arg * HZ);
		break;
	default:
		return -ENOTTY;
	}

	return 0;
}

/* File operations of the character device; passed to cdev_init() in deferred_init(). */
struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = deferred_open,
.release = deferred_release,
.unlocked_ioctl = deferred_ioctl,
};

/*
 * Module entry point: reserve the device number, initialise the device state
 * (flag, work item, lock, list, timer) and only then publish the character
 * device.
 *
 * Returns 0 on success or the negative errno reported by
 * register_chrdev_region() or cdev_add().
 */
static int deferred_init(void)
{
	int err;

	pr_info("[deferred_init] Init module\n");
	err = register_chrdev_region(MKDEV(MY_MAJOR, MY_MINOR), 1, MODULE_NAME);
	if (err) {
		pr_info("[deffered_init] register_chrdev_region: %d\n", err);
		return err;
	}

	dev.flag = TIMER_TYPE_NONE;
	INIT_WORK(&dev.work, work_handler);
	spin_lock_init(&dev.lock);
	INIT_LIST_HEAD(&dev.list);
	/*
	 * The timer has to be set up before cdev_add(): once the device is
	 * visible, an ioctl may call mod_timer() on it at any moment.
	 */
	timer_setup(&dev.timer, timer_handler, 0);

	cdev_init(&dev.cdev, &my_fops);
	err = cdev_add(&dev.cdev, MKDEV(MY_MAJOR, MY_MINOR), 1);
	if (err) {
		pr_info("[deferred_init] cdev_add: %d\n", err);
		/* do not leave the device number reserved after a failed load */
		unregister_chrdev_region(MKDEV(MY_MAJOR, MY_MINOR), 1);
		return err;
	}

	return 0;
}

/*
 * Module exit: remove the character device, stop the timer, wait for
 * scheduled work, then release every entry left in the monitored list.
 */
static void deferred_exit(void)
{
struct mon_proc *p, *n;

pr_info("[deferred_exit] Exit module\n" );

cdev_del(&dev.cdev);
unregister_chrdev_region(MKDEV(MY_MAJOR, MY_MINOR), 1);

/* TODO 1: Cleanup: make sure the timer is not running after exiting. */
del_timer_sync(&dev.timer);
/* TODO 3: Cleanup: make sure the work handler is not scheduled. */
flush_scheduled_work();
/* TODO 4: Cleanup the monitored process list */
/* The list is walked without taking dev.lock; the timer was stopped above. */
list_for_each_entry_safe(p, n, &dev.list, list) {
/* TODO 4: ... decrement task usage counter ... */
/* TODO 4: ... remove it from the list ... */
/* TODO 4: ... free the struct mon_proc */
put_task_struct(p->task);
list_del(&p->list);
kfree(p);
}
}

module_init(deferred_init);
module_exit(deferred_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
root@qemux86:~/skels/deferred_work/3-4-5-deferred# insmod ./kernel/deferred.ko  
deferred: loading out-of-tree module taints kernel.
[deferred_init] Init module
root@qemux86:~/skels/deferred_work/3-4-5-deferred# mknod /dev/deferred c 42 0
root@qemux86:~/skels/deferred_work/3-4-5-deferred# ./user/test a 3
[deferred_open] Device opened
Allocate memory after 3 seconds
[deferred_ioctl] Command: Allocate memory
[deferred_release] Device released
root@qemux86:~/skels/deferred_work/3-4-5-deferred# [timer_handler] pid = 0, com0
BUG: scheduling while atomic: swapper/0/0/0x00000102
  • 驱动程序导致错误,因为在原子上下文中调用了阻塞函数 alloc_io(计时器处理程序运行在中断上下文中)
  • 但是 schedule_work(&my_data->work) 中,使用工作队列新开一个内核线程来运行 alloc_io 就不会报错(这也是工作队列的优势)
1
2
3
4
5
6
7
8
9
root@qemux86:~/skels/deferred_work/3-4-5-deferred# ./user/test p 3              
[deferred_open] Device opened
Monitor PID 3.
[deferred_ioctl] Command: Monitor pid
[deferred_release] Device released
root@qemux86:~/skels/deferred_work/3-4-5-deferred# [timer_handler] pid = 0, com0
task test (239) is dead
[timer_handler] pid = 0, comm = swapper/0
[timer_handler] pid = 0, comm = swapper/0
  • 正常打印进程信息

3.Kthread 完整代码:

  • 实现一个简单的模块,该模块会创建用于显示当前进程标识符的内核线程
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
* SO2 - Lab 6 - Deferred Work
*
* Exercise #6: kernel thread
*/

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/kthread.h>

MODULE_DESCRIPTION("Simple kernel thread");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* the kernel thread sleeps here until flag_stop_thread becomes non-zero */
wait_queue_head_t wq_stop_thread;
atomic_t flag_stop_thread;
/* module exit sleeps here until flag_thread_terminated becomes non-zero */
wait_queue_head_t wq_thread_terminated;
atomic_t flag_thread_terminated;

int my_thread_f(void *data)
{
pr_info("[my_thread_f] Current process id is %d (%s)\n",
current->pid, current->comm);
/* TODO: Wait for command to remove module on wq_stop_thread queue. */
wait_event_interruptible(wq_stop_thread,atomic_read(&flag_stop_thread)!=0);
/* TODO: set flag to mark kernel thread termination */
atomic_set(&flag_thread_terminated,1);
/* TODO: notify the unload process that we have exited */
wake_up_interruptible(&wq_thread_terminated);
pr_info("[my_thread_f] Exiting\n");
do_exit(0);
}

/*
 * Module entry point: initialise the wait queues and flags, then create and
 * start the kernel thread.
 *
 * Returns 0 on success or the negative errno reported by kthread_create().
 */
static int __init kthread_init(void)
{
	struct task_struct *kt;

	pr_info("[kthread_init] Init module\n");

	init_waitqueue_head(&wq_stop_thread);
	init_waitqueue_head(&wq_thread_terminated);
	atomic_set(&flag_stop_thread, 0);
	atomic_set(&flag_thread_terminated, 0);

	kt = kthread_create(my_thread_f, NULL, "yhellow");
	/*
	 * kthread_create() returns an ERR_PTR value on failure; passing that
	 * to wake_up_process() would dereference an invalid pointer.
	 */
	if (IS_ERR(kt)) {
		pr_err("[kthread_init] kthread_create failed: %ld\n",
		       PTR_ERR(kt));
		return PTR_ERR(kt);
	}
	wake_up_process(kt);

	return 0;
}

/*
 * Module exit: raise the stop flag, wake the kernel thread and wait until it
 * reports its termination.
 */
static void __exit kthread_exit(void)
{
/* TODO: notify the kernel thread that its time to exit */
atomic_set(&flag_stop_thread,1);
wake_up_interruptible(&wq_stop_thread);
/* TODO: wait for the kernel thread to exit */
/*
 * NOTE(review): the return value of this interruptible wait is ignored, so
 * it looks like a signal could end the wait before the flag is set --
 * confirm whether an uninterruptible wait (with a matching wake_up() in the
 * thread) is needed here.
 */
wait_event_interruptible(wq_thread_terminated,atomic_read(&flag_thread_terminated) != 0);
pr_info("[kthread_exit] Exit module\n");
}

module_init(kthread_init);
module_exit(kthread_exit);
  • 结果:
1
2
3
4
5
6
7
root@qemux86:~/skels/deferred_work/6-kthread# insmod kthread.ko                 
kthread: loading out-of-tree module taints kernel.
[kthread_init] Init module
[my_thread_f] Current process id is 258 (yhellow)
root@qemux86:~/skels/deferred_work/6-kthread# rmmod kthread.ko
[my_thread_f] Exiting
[kthread_exit] Exit modul
  • 必须等到内核线程停止以后,内核模块才可以停止
  • 由于内核线程和内核模块是同时运行的,为了使内核线程先停止,做了以下操作:
    • kthread_exit 执行之前,内核线程会因为 flag_stop_thread = 0 被添加到等待队列中
    • kthread_exit 执行的过程中设置 flag_stop_thread = 1,同时唤醒内核线程(如果不设置 flag_stop_thread = 1 而直接唤醒,就会导致死锁),然后因为 flag_thread_terminated = 0 被添加到等待队列中
    • my_thread_f 中设置 flag_thread_terminated = 1,同时唤醒内核模块
    • 由于唤醒需要时间:内核线程先关闭,内核模块后关闭
  • 本实验只涉及“单个内核线程”的情况,如果需要处理多个内核线程,则可以使用之前给出的模板

IO access and Interrupts

实验目的:

  • 与外围设备通信
  • 实现中断处理程序
  • 将中断与进程上下文同步

外围设备通过 Read/Write 其寄存器进行控制:

  • 通常,设备具有多个寄存器,可以在内存地址空间或 I/O 地址空间中的连续地址访问这些寄存器
  • 连接到 I/O 总线的每个设备都有一组 I/O 地址,称为 I/O 端口
  • I/O 端口可以映射到物理内存地址,以便处理器可以通过直接与内存配合使用的指令与设备通信
  • 为简单起见,我们将直接使用 I/O 端口(不映射到物理内存地址)与物理设备进行通信

每个器件的 I/O 端口被结构化为一组专用寄存器,以提供统一的编程接口,大多数设备将具有以下类型的寄存器:

  • Control registers:接收设备命令
  • Status registers:包含有关设备内部状态的信息
  • Input registers:从设备中获取数据 - Read
  • Output registers:在其中写入数据并传输给设备 - Write

Accessing the hardware

在 Linux 中,I/O 端口访问在所有体系结构上实现,并且可以使用多个 API

在访问 I/O 端口之前,首先必须请求访问它们,以确保只有一个用户在使用:

1
2
3
4
#include <linux/ioport.h>

struct resource *request_region(unsigned long first, unsigned long n,
const char *name);
  • first:IO 端口的基地址
  • n:IO 端口占用的范围
  • name:使用这段 IO 地址的设备名

要释放保留区域 resource,必须使用以下函数:

1
void release_region(unsigned long start, unsigned long n);

使用案例如下:

1
2
3
4
5
6
7
8
9
10
11
#include <linux/ioport.h>

#define MY_BASEPORT 0x3F8
#define MY_NR_PORTS 8

if (!request_region(MY_BASEPORT, MY_NR_PORTS, "com1")) {
/* handle error */
return -ENODEV;
}

release_region(MY_BASEPORT, MY_NR_PORTS);

所有端口请求都可以通过文件从用户空间看到:/proc/ioports

1
2
3
4
5
6
7
8
9
10
11
12
root@qemux86:~# cat /proc/ioports                                               
0000-0cf7 : PCI Bus 0000:00
0000-001f : dma1
0020-0021 : pic1
0040-0043 : timer0
0050-0053 : timer1
0060-0060 : keyboard
0064-0064 : keyboard
0080-008f : dma page reg
00a0-00a1 : pic2
00c0-00df : dma2
00f0-00ff : fpu

驱动程序获得所需的 I/O 端口范围后,可以在这些端口上执行读取或写入操作:

1
2
3
4
5
6
7
unsigned inb(int port); /* reads one byte (8 bits) from port */
unsigned inw(int port); /* reads two bytes (16-bit) from ports */
unsigned inl (int port); /* reads four bytes (32-bits) from port */

void outb(unsigned char byte, int port); /* writes one byte (8 bits) to port */
void outw(unsigned short word, int port); /* writes two bytes (16-bits) to port */
void outl(unsigned long word, int port); /* writes four bytes (32-bits) to port */
  • 读取出来的字符并不是 ASCII,而是数据寄存器中的扫描码 scancode
  • 我们只需要在按下时选择代码,然后解码 ASCII 字符
  • PS:键盘 “按下时” 和 “松开时” 是两个不同的 scancode,后面的 is_key_press 用于展示这个特点

Interrupt handling

与其他资源一样,驱动程序必须先申请中断线(通过 request_irq 注册中断处理程序),然后才能使用它,并在使用结束时通过 free_irq 释放它:

1
2
3
4
5
6
7
8
#include <linux/interrupt.h>

typedef irqreturn_t (*irq_handler_t)(int, void *);

int request_irq(unsigned int irq_no, irq_handler_t handler,
unsigned long flags, const char *dev_name, void *dev_id);

void free_irq(unsigned int irq_no, void *dev_id);
  • 中断处理程序函数在中断上下文中执行,这意味着无法调用阻塞 API
  • 必须避免在中断处理程序中执行大量工作,而是在需要时使用延迟工作

中断处理程序函数的签名:

1
irqreturn_t (*handler)(int irq_no, void *dev_id);
  • irq_no:中断编号
  • irqreturn_t:处理函数的返回类型,标识返回信息
    • IRQ_NONE:中断不适用于此设备(共享中断)
    • IRQ_HANDLED:中断可以直接在中断上下文中处理
    • IRQ_WAKE_THREAD:计划进程上下文处理函数的运行

实例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include <linux/interrupt.h>

#define MY_BASEPORT 0x3F8
#define MY_IRQ 4

/*
 * Skeleton of an interrupt handler for a (possibly shared) interrupt line.
 *
 * irq_no: number of the interrupt being serviced.
 * dev_id: the cookie that was passed as the last argument of request_irq();
 *         in this example it is the driver's struct my_device_data.
 *
 * Returns IRQ_HANDLED. The commented-out IRQ_NONE path marks where a handler
 * on a shared line would bail out when the interrupt is not for its device.
 */
irqreturn_t my_handler(int irq_no, void *dev_id)
{
/* recover the per-device data registered together with the handler */
struct my_device_data *my_data = (struct my_device_data *) dev_id;
/* if interrupt is not for this device (shared interrupts) */
/* return IRQ_NONE;*/

/* clear interrupt-pending bit */
/* read from device or write to device*/
return IRQ_HANDLED;
}

/*
 * Sketch of a module init routine that attaches my_handler() to MY_IRQ as a
 * shared interrupt. The "[...]" markers stand for code elided from the
 * listing.
 *
 * Returns the negative error code from request_irq() when registration
 * fails. The return type is spelled out explicitly: implicit int is not
 * valid since C99.
 */
static int my_init(void)
{
[...]
	struct my_device_data *my_data;
	int err;

	/*
	 * my_data is handed to the handler as dev_id, so it has to point at
	 * the device data by the time the IRQ is requested (set up in the
	 * elided code above).
	 */
	err = request_irq(MY_IRQ, my_handler, IRQF_SHARED,
			  "com1", my_data);
	if (err < 0) {
		/* handle error*/
		return err;
	}
[...]
}

有关系统中断的信息和统计信息可以在 /proc/interrupts 和 /proc/stat 中找到

1
2
3
4
5
6
7
8
root@qemux86:~# cat /proc/interrupts                                            
CPU0
0: 71 IO-APIC 2-edge timer
1: 9 IO-APIC 1-edge i8042
9: 0 IO-APIC 9-fasteoi acpi
10: 403 IO-APIC 10-fasteoi virtio1, virtio2, virtio5
11: 22 IO-APIC 11-fasteoi virtio3, virtio4, virtio0
12: 125 IO-APIC 12-edge i8042

Locking

由于中断处理程序在中断上下文中运行,因此可以执行的操作受到限制:

  • 无法访问用户空间内存
  • 无法调用阻塞函数,因此不能使用互斥锁(中断发生时,程序会把当前进程的上下文保存到内核栈上,称为中断帧,如果在中断中发生阻塞,schedule 新调用的进程很可能会破坏中断帧),其实这是为了实现中断嵌套所付出的代价
  • 使用自旋锁进行同步也很棘手(如果所使用的自旋锁,已被正在运行的处理程序中断的进程获取,则可能导致死锁)

在某些情况下,设备驱动程序必须使用中断进行同步(例如,当数据在中断处理程序和进程上下文或下半部分处理程序之间共享时),在这些情况下,有必要停用中断并使用自旋锁:

1
2
3
4
5
void spin_lock_irqsave (spinlock_t * lock, unsigned long flags); /* 保存中断的当前状态,禁止本地中断,获取指定的锁 */
void spin_unlock_irqrestore (spinlock_t * lock, unsigned long flags); /* 对指定的锁进行解锁,并恢复到加锁之前的状态 */

void spin_lock_irq (spinlock_t * lock); /* 禁止本地中断,获取指定的锁 */
void spin_unlock_irq (spinlock_t * lock); /* 对指定的锁进行解锁,恢复本地中断 */

为了使用在进程上下文和中断处理例程之间共享的资源,将按如下方式使用上述功能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static spinlock_t lock;

/*
 * IRQ handling routine: interrupt context.
 *
 * Sketch only ("..." stands for elided code). Inside the handler the shared
 * resource is guarded with plain spin_lock()/spin_unlock(); the process
 * context side (my_access) is the one that takes the same lock with
 * interrupts disabled.
 */
irqreturn_t kbd_interrupt_handle(int irq_no, void * dev_id)
{
...
spin_lock(&lock);
/* Critical region - access shared resource */
spin_unlock (&lock);
...
}

/*
 * Process context: Disable interrupts when locking.
 *
 * Takes the lock that is also used by kbd_interrupt_handle(). The irqsave
 * variant is used so that the handler cannot interrupt this code on the same
 * CPU while the lock is held and then spin on it forever.
 */
static void my_access(void)
{
/* holds the interrupt state saved by spin_lock_irqsave() */
unsigned long flags;

spin_lock_irqsave(&lock, flags);
/* Critical region - access shared resource */
spin_unlock_irqrestore(&lock, flags);

...
}

/*
 * Initialisation sketch: the spinlock has to be initialised with
 * spin_lock_init() before either the handler or my_access() uses it.
 * "..." stands for elided code.
 */
void my_init (void)
{
...
spin_lock_init (&lock);
...
}
  • 因为系统硬中断 kbd_interrupt_handle 在任何时候都可以发生(内核会直接抢占原来的进程,从而执行硬中断)
  • 如果在 my_access 中拿了自旋锁之后被 kbd_interrupt_handle 抢占,就会发生死锁
  • 如果不在 kbd_interrupt_handle 中加锁,又可能会破坏共享数据(例如:在引用指针之前置空了指针)
  • 因此需要使用 spin_lock_irqsave 在加锁的同时禁止硬中断

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=interrupts make skels
make build

直接看完整代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <asm/io.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>

MODULE_DESCRIPTION("KBD");
MODULE_AUTHOR("Kernel Hacker");
MODULE_LICENSE("GPL");

#define MODULE_NAME "kbd"

#define KBD_MAJOR 42
#define KBD_MINOR 0
#define KBD_NR_MINORS 1

#define I8042_KBD_IRQ 1
#define I8042_STATUS_REG 0x64
#define I8042_DATA_REG 0x60

#define BUFFER_SIZE 1024
#define SCANCODE_RELEASED_MASK 0x80

/*
 * Per-device state of the keyboard sniffer; a single instance lives in
 * devs[0].
 */
struct kbd {
/* character device embedded so container_of() can recover this struct */
struct cdev cdev;
/* TODO 3: add spinlock */
/* protects buf, put_idx, get_idx and count */
spinlock_t lock;
/* circular buffer of decoded key characters */
char buf[BUFFER_SIZE];
/* put_idx: next slot to write; get_idx: next slot to read; count: bytes stored */
size_t put_idx, get_idx, count;
} devs[1];

/*
 * Tell a key-press scancode apart from a key-release one.
 *
 * Returns 1 when the SCANCODE_RELEASED_MASK bit is clear (key pressed) and
 * 0 when it is set (key released).
 */
static int is_key_press(unsigned int scancode)
{
	return (scancode & SCANCODE_RELEASED_MASK) == 0;
}

/*
 * Translate a scancode into the character printed on the key.
 *
 * The release bit is ignored, so press and release codes of a key map to
 * the same character. Only digits, letters, space and enter are known;
 * every other scancode yields '?'.
 */
static int get_ascii(unsigned int scancode)
{
	/* one entry per keyboard row: contiguous scancode range -> characters */
	static const struct {
		unsigned int first;
		unsigned int last;
		const char *keys;
	} rows[] = {
		{ 0x02, 0x0b, "1234567890" },
		{ 0x10, 0x19, "qwertyuiop" },
		{ 0x1e, 0x26, "asdfghjkl" },
		{ 0x2c, 0x32, "zxcvbnm" },
	};
	unsigned int code = scancode & ~SCANCODE_RELEASED_MASK;
	size_t r;

	for (r = 0; r < sizeof(rows) / sizeof(rows[0]); r++) {
		if (code >= rows[r].first && code <= rows[r].last)
			return rows[r].keys[code - rows[r].first];
	}

	switch (code) {
	case 0x39:
		return ' ';
	case 0x1c:
		return '\n';
	default:
		return '?';
	}
}

/*
 * Append one character to the circular buffer.
 * The character is silently dropped when the buffer already holds
 * BUFFER_SIZE entries. The caller is expected to hold data->lock.
 */
static void put_char(struct kbd *data, char c)
{
	if (data->count < BUFFER_SIZE) {
		data->buf[data->put_idx++] = c;
		/* wrap the write position around the end of the buffer */
		data->put_idx %= BUFFER_SIZE;
		data->count++;
	}
}

/*
 * Pop the oldest character from the circular buffer into *c.
 * Returns true when a character was delivered, false when the buffer was
 * empty (in which case *c is left untouched).
 */
static bool get_char(char *c, struct kbd *data)
{
	/* TODO 4: get char from buffer; update count and get_idx */
	if (data->count == 0)
		return false;

	*c = data->buf[data->get_idx++];
	/* wrap the read position around the end of the buffer */
	data->get_idx %= BUFFER_SIZE;
	data->count--;
	return true;
}

/*
 * Discard everything stored in the circular buffer.
 *
 * All three bookkeeping fields are reset in constant time. The previous
 * version walked put_idx backwards one slot per stored byte, which relied on
 * size_t wraparound (only correct for a power-of-two BUFFER_SIZE), never
 * touched get_idx and kept interrupts disabled for up to BUFFER_SIZE
 * iterations. Stale bytes left in buf are unreachable because get_char()
 * is gated by count.
 *
 * The caller is expected to hold data->lock.
 */
static void reset_buffer(struct kbd *data)
{
	/* TODO 5: reset count, put_idx, get_idx */
	printk("reset");
	data->count = 0;
	data->put_idx = 0;
	data->get_idx = 0;
}

/*
 * Read one byte from the i8042 DATA register (I/O port I8042_DATA_REG)
 * and return it.
 */
static inline u8 i8042_read_data(void)
{
	/* TODO 3: Read DATA register (8 bits). */
	return inb(I8042_DATA_REG);
}

/* TODO 2: implement interrupt handler */
/* TODO 3: read the scancode */
/* TODO 3: interpret the scancode */
/* TODO 3: display information about the keystrokes */
/* TODO 3: store ASCII key to buffer */
/*
 * Keyboard interrupt handler (runs in interrupt context).
 *
 * Reads the scancode from the data register, logs it, and stores the decoded
 * character in the device buffer for key presses only; releases are logged
 * but not stored.
 *
 * dev_id is the struct kbd registered with request_irq().
 * Always returns IRQ_NONE - presumably so that the other handler sharing
 * this line is still treated as the one servicing the device; confirm.
 */
irqreturn_t kbd_interrupt_handler(int irq_no,void* dev_id)
{
	struct kbd *kbd = dev_id;
	u8 code = i8042_read_data();
	int is_press = is_key_press(code);
	int ch = get_ascii(code);

	pr_info("IRQ: %d, scancode = 0x%x (%u), pressed = %d, ch = %c\n",
		irq_no, code, code, is_press, ch);

	if (is_press) {
		/* plain spin_lock: process-context users take it with IRQs off */
		spin_lock(&kbd->lock);
		put_char(kbd, ch);
		spin_unlock(&kbd->lock);
	}

	return IRQ_NONE;
}

/*
 * open() handler: locate the struct kbd that embeds the inode's cdev and
 * stash it in file->private_data for the read/write handlers.
 * Always returns 0.
 */
static int kbd_open(struct inode *inode, struct file *file)
{
	file->private_data = container_of(inode->i_cdev, struct kbd, cdev);
	pr_info("%s opened\n", MODULE_NAME);

	return 0;
}

/*
 * release() handler: nothing to tear down, only logs that the device file
 * was closed. Always returns 0.
 */
static int kbd_release(struct inode *inode, struct file *file)
{
pr_info("%s closed\n", MODULE_NAME);
return 0;
}

/* TODO 5: add write operation and reset the buffer */
/*
 * write() handler: any write clears the key buffer.
 *
 * The written data itself is ignored (user_buffer is never read). The
 * buffer is reset with interrupts disabled because the interrupt handler
 * takes the same lock. Returns size, i.e. reports every byte as consumed.
 */
static ssize_t kbd_write(struct file *file, const char __user *user_buffer,
			 size_t size, loff_t *offset)
{
	struct kbd *kbd = file->private_data;
	unsigned long irq_state;

	spin_lock_irqsave(&kbd->lock, irq_state);
	reset_buffer(kbd);
	spin_unlock_irqrestore(&kbd->lock, irq_state);

	return size;
}

/*
 * read() handler: move up to size buffered characters to user space.
 *
 * Characters are popped one at a time; the lock is held (with interrupts
 * disabled) only around each pop, never across put_user(), which may fault.
 *
 * Returns the number of bytes copied (0 when the buffer is empty) or
 * -EFAULT when a byte cannot be stored in the user buffer.
 */
static ssize_t kbd_read(struct file *file, char __user *user_buffer,
			size_t size, loff_t *offset)
{
	struct kbd *kbd = file->private_data;
	unsigned long irq_state;
	size_t copied;
	bool got;
	char ch;

	/* TODO 4: read data from buffer */
	for (copied = 0; copied < size; copied++) {
		spin_lock_irqsave(&kbd->lock, irq_state);
		got = get_char(&ch, kbd);
		spin_unlock_irqrestore(&kbd->lock, irq_state);

		if (!got)
			break;
		if (put_user(ch, user_buffer + copied))
			return -EFAULT;
	}

	return copied;
}

/*
 * File operations exposed by the kbd character device: open/release only
 * log, read drains the key buffer, write clears it.
 */
static const struct file_operations kbd_fops = {
.owner = THIS_MODULE,
.open = kbd_open,
.release = kbd_release,
.read = kbd_read,
/* TODO 5: add write operation */
.write = kbd_write,
};

/*
 * Module init: reserve the device number and I/O ports, hook the keyboard
 * IRQ and publish the character device.
 *
 * Every acquisition has a matching unwind label, so a failure at any step
 * releases exactly what was obtained before it. Two leaks are fixed: the
 * first I/O region used to stay reserved when the second request failed,
 * and a failing cdev_add() was ignored, leaving the IRQ handler registered.
 *
 * Returns 0 on success or a negative errno.
 */
static int kbd_init(void)
{
	int err;

	err = register_chrdev_region(MKDEV(KBD_MAJOR, KBD_MINOR),
				     KBD_NR_MINORS, MODULE_NAME);
	if (err != 0) {
		pr_err("register_region failed: %d\n", err);
		goto out;
	}

	/* TODO 1: request the keyboard I/O ports */
	/*
	 * The ports next to the real DATA/STATUS registers are reserved
	 * (+1): the real ones are reported as already owned ("keyboard" in
	 * /proc/ioports), so requesting them would fail.
	 */
	if (!request_region(I8042_DATA_REG+1, KBD_NR_MINORS, MODULE_NAME)) {
		err = -EBUSY;
		goto out_unregister;
	}
	if (!request_region(I8042_STATUS_REG+1, KBD_NR_MINORS, MODULE_NAME)) {
		err = -EBUSY;
		goto out_release_data;
	}

	/* TODO 3: initialize spinlock */
	/* must be ready before the handler can run */
	spin_lock_init(&devs[0].lock);
	/* TODO 2: Register IRQ handler for keyboard IRQ (IRQ 1). */
	err = request_irq(I8042_KBD_IRQ, kbd_interrupt_handler,
			  IRQF_SHARED, MODULE_NAME, &devs[0]);
	if (err)
		goto out_release_regions;

	cdev_init(&devs[0].cdev, &kbd_fops);
	err = cdev_add(&devs[0].cdev, MKDEV(KBD_MAJOR, KBD_MINOR), 1);
	if (err)
		goto out_free_irq;

	pr_notice("Driver %s loaded\n", MODULE_NAME);
	return 0;

	/*TODO 2: release regions in case of error */
out_free_irq:
	free_irq(I8042_KBD_IRQ, &devs[0]);
out_release_regions:
	release_region(I8042_STATUS_REG+1, KBD_NR_MINORS);
out_release_data:
	release_region(I8042_DATA_REG+1, KBD_NR_MINORS);
out_unregister:
	unregister_chrdev_region(MKDEV(KBD_MAJOR, KBD_MINOR),
				 KBD_NR_MINORS);
out:
	return err;
}

/*
 * Module exit: undo kbd_init() in reverse order - remove the character
 * device, free the IRQ, release both I/O regions and finally give back the
 * device number.
 */
static void kbd_exit(void)
{
/* stop new opens before the handler and resources go away */
cdev_del(&devs[0].cdev);

/* TODO 2: Free IRQ. */
/* dev_id must be the same cookie that was passed to request_irq() */
free_irq(I8042_KBD_IRQ,&devs[0]);
/* TODO 1: release keyboard I/O ports */
release_region(I8042_STATUS_REG+1,KBD_NR_MINORS);
release_region(I8042_DATA_REG+1,KBD_NR_MINORS);
unregister_chrdev_region(MKDEV(KBD_MAJOR, KBD_MINOR),
KBD_NR_MINORS);
pr_notice("Driver %s unloaded\n", MODULE_NAME);
}

module_init(kbd_init);
module_exit(kbd_exit);
  • 注意:当 insmod 这个驱动程序时可能会报错
1
2
3
root@qemux86:~# insmod skels/interrupts/kbd.ko
kbd: loading out-of-tree module taints kernel.
insmod: can't insert 'skels/interrupts/kbd.ko': Device or resource busy
  • 这是因为键盘 IO 已经有对应的驱动了:
1
2
3
root@qemux86:~# cat /proc/ioports | egrep "(0060|0064)"
0060-0060 : keyboard
0064-0064 : keyboard
  • 键盘 I/O 端口是在引导期间由内核注册的,我们将无法删除关联的模块
  • 因此,我们需要欺骗内核并注册 0x61 和 0x65 端口
1
2
3
4
5
6
7
root@qemux86:~/skels/interrupts# insmod kbd.ko                                  
kbd: loading out-of-tree module taints kernel.
Driver kbd loaded
root@qemux86:~/skels/interrupts# cat /proc/ioports | grep kbd
0061-0061 : kbd
root@qemux86:~/skels/interrupts# rmmod kbd
Driver kbd unloaded
  • 当中断注册完成以后,可以在 /proc/interrupts 中进行查看:
1
2
3
4
5
6
7
8
root@qemux86:~/skels/interrupts# cat /proc/interrupts                           
CPU0
0: 68 IO-APIC 2-edge timer
1: 9 IO-APIC 1-edge i8042, kbd
9: 0 IO-APIC 9-fasteoi acpi
10: 452 IO-APIC 10-fasteoi virtio1, virtio2, virtio5
11: 13 IO-APIC 11-fasteoi virtio3, virtio4, virtio0
12: 125 IO-APIC 12-edge i8042
  • 通过如下命令可以在虚拟机中打开键盘:(用来查看 kbd_interrupt_handler 是否正确)
1
QEMU_DISPLAY=gtk make boot 
  • 被这个多线程和锁搞得头痛,有时候莫名其妙陷入死锁,尝试使用 printk 打印时还报错,看了答案以后进行了大刀阔斧的修改才解决了问题
  • 另外 i8042_read_data 实际上是使用了系统自带的键盘 IO 端口(程序中自己申请的 IO 端口其实就是花架子,不影响程序流程)
  • 在程序输入输出的地方都需要用锁,但是如果在程序拿到锁的情况下发生中断,中断中也要拿同一个锁的情况下就会发生问题(硬件中断在任何时候都会发生)
1
2
3
spin_lock_irqsave(&data->lock,flag);
more = get_char(&ch,data);
spin_unlock_irqrestore(&data->lock,flag);
  • 所以使用 spin_lock_irqsave 在加锁的同时禁止中断

仔细对比了答案以后,又分析了一下之前死锁的原因,感觉问题应该出在 cat 中:

  • kbd_open 正常执行
  • kbd_read 有返回但是无限循环
  • kbd_release 根本不会执行,程序卡死在 kbd_read

我之前的代码是这样的:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * EARLIER, BUGGY version of the read() handler, kept to illustrate the hang
 * described in the surrounding text.
 *
 * Defect: the function returns `read`, the upper bound computed up front,
 * instead of the number of bytes actually copied. *offset is never
 * advanced, so that bound is non-zero for every call with size > 0, even
 * when the buffer is empty and nothing was written to user_buffer. A
 * reader that loops until read() returns 0 therefore never stops.
 */
static ssize_t kbd_read(struct file *file,  char __user *user_buffer,
size_t size, loff_t *offset)
{
struct kbd *data = (struct kbd *) file->private_data;
/* upper bound only - NOT the amount of data available */
size_t read = min(BUFFER_SIZE - *offset, size);
unsigned long flag;
char ch;
bool more = true;
int i = 0;

if(read == 0){
printk("read full");
return 0;
}

/* TODO 4: read data from buffer */
for(i=0;i<read;i++){
spin_lock_irqsave(&data->lock,flag);
more = get_char(&ch,data);
spin_unlock_irqrestore(&data->lock,flag);

/* buffer drained: i holds the real byte count at this point */
if(!more)
break;
if(put_user(ch,user_buffer++))
return -EFAULT;
}

/* BUG: should report the bytes copied (i), not the requested bound */
return read;
}
  • 这个函数有个很明显的问题,就是 Read 的返回值不对(刚开始也没考虑这些问题)
  • 把它修改为以下代码后就没有问题了:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Corrected read() handler: copies up to size buffered characters to user
 * space and returns how many were actually copied.
 *
 * The byte counter doubles as the loop variable. The former separate
 * `int i` was compared against the size_t request size and would overflow
 * for requests larger than INT_MAX.
 *
 * Returns the number of bytes copied (0 when the buffer is empty or size is
 * 0), or -EFAULT when a byte cannot be stored in the user buffer.
 */
static ssize_t kbd_read(struct file *file,  char __user *user_buffer,
		size_t size, loff_t *offset)
{
	struct kbd *data = (struct kbd *) file->private_data;
	size_t read = 0;
	unsigned long flag;
	char ch;
	bool more = true;

	if(size == 0){
		printk("read full");
		return 0;
	}

	/* TODO 4: read data from buffer */
	while (read < size) {
		/* lock only around the pop; put_user() may fault and sleep */
		spin_lock_irqsave(&data->lock,flag);
		more = get_char(&ch,data);
		spin_unlock_irqrestore(&data->lock,flag);

		if(!more)
			break;
		if(put_user(ch,user_buffer++))
			return -EFAULT;
		read++;
	}

	return read;
}

基础知识

在多线程的程序中,进程中的全局变量与函数内定义的静态(static)变量,是各个线程都可以访问的共享变量,因此往往会有资源条件竞争的问题,常见的处理办法就是使用同步机制来维护资源

线程局部存储(Thread Local Storage,TLS)是一种另类的解决方式:

  • 它存储和维护一些线程相关的数据,存储的数据会被关联到当前线程中去,并不需要锁来维护
  • 本质上是为每一个使用该全局变量的线程都提供一个变量值的副本,每一个线程均可以独立地改变自己的副本,而不会和其它线程的副本冲突

TLS实现方式:API

Linux 中相关的 API:

1
2
3
4
int pthread_key_create(pthread_key_t *key, void (*destructor)(void*)); /* 构建一个pthread_key_t类型,确实是相当于一个key */
int pthread_key_delete(pthread_key_t key); /* 注销一个pthread_key_t */
void *pthread_getspecific(pthread_key_t key); /* 将与pthread_key_t相关联的数据读出来 */
int pthread_setspecific(pthread_key_t key, const void *value); /* 用于将value的副本存储于一数据结构中,并将其与调用线程以及pthread_key_t相关联 */
  • 结构关系图如下:

本人在实际操作这些 API 时遇到了许多问题,先看一个失败的案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include<stdio.h>  
#include<pthread.h>
#include<string.h>

pthread_key_t p_key;

/*
 * Thread body of the "failed" TLS experiment.
 *
 * args points to an int; the value is printed, incremented and printed
 * again. main() passes the SAME pointer to every thread, so all threads
 * update one shared int without any synchronisation - which is why the
 * printed values interleave instead of each thread seeing 1 then 2.
 */
void *thread_func(void *args)
{
printf("%d is runing in %s\n",*(int*)args,__func__);
/* unsynchronised read-modify-write on the shared int */
*(int*)args = *(int*)args + 1;
printf("%d is runing in %s\n",*(int*)args,__func__);
return (void*)0;
}

/*
 * Driver of the "failed" TLS experiment: creates a key, stores a heap int
 * under it for the MAIN thread only, then starts 24 threads that all
 * receive that same heap pointer.
 *
 * Why it does not give per-thread data: pthread_setspecific() binds the
 * value to the calling thread (main); the workers never consult the key,
 * they simply dereference the pointer they were handed.
 *
 * NOTE(review): malloc() is used but this listing includes only <stdio.h>,
 * <pthread.h> and <string.h>. main() also returns without joining the
 * threads.
 */
int main()
{
pthread_t p[24];
pthread_key_create(&p_key,NULL);
/* value read here is discarded: a is overwritten on the next line */
int *a = (int*)pthread_getspecific(p_key);
a = malloc(sizeof(int));
/* associates a with the main thread only */
pthread_setspecific(p_key, a);
*a = 1;
for(int i=0;i<24;i++){
/* every thread gets the same pointer -> shared, not thread-local */
pthread_create(&p[i], NULL,thread_func,a);
}
return 0;
}
1
2
3
4
5
6
7
8
9
exp ./test                         
1 is runing in thread_func
2 is runing in thread_func
1 is runing in thread_func
2 is runing in thread_func
3 is runing in thread_func
5 is runing in thread_func
3 is runing in thread_func
......
  • 位于堆区的 a 本来应该充当 TLS 的作用,但各个线程好像还是共用了同一片空间
  • pthread_setspecific 对主线程进行了绑定,而忽略了其他线程(其实我还有多个测试案例:把 pthread_setspecific 放到线程函数里面,使用不同的 pthread_key_t 变量 …… 但这些操作都失败了)
  • 其实这里是我的理解有误区,认为这些 API 函数会自动帮我为不同的线程复制数据

接下来看一个成功的案例:(错误码 errno 的实现原理)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include<errno.h>
#include<pthread.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>

void *thread_deal_func(void *arg);

#define TASK_NUM 2
pthread_t global_thread_no[TASK_NUM];

/* Key under which each thread stores the address of its private int. */
static pthread_key_t key;
/* Ensures make_key() runs exactly once, whichever thread gets there first. */
static pthread_once_t key_once = PTHREAD_ONCE_INIT;

/*
 * One-time initialiser invoked through pthread_once(): creates the key.
 * free() is registered as the destructor so a thread's slot is released
 * when that thread exits instead of being leaked.
 */
static void make_key(void){
	(void) pthread_key_create(&key, free);
}

/*
 * Return the address of the calling thread's private error slot, creating
 * it on first use.
 *
 * The slot is allocated zero-initialised (calloc), so a thread that does
 * `errno_test += 1` before ever assigning reads 0 instead of indeterminate
 * heap contents.
 *
 * Returns NULL when the slot cannot be allocated or bound to the thread.
 */
int * _errno(void){
	int *ptr;

	(void) pthread_once(&key_once, make_key);
	ptr = pthread_getspecific(key);
	if (ptr == NULL)
	{
		ptr = calloc(1, sizeof *ptr);
		/* do not hand out a slot the key does not remember */
		if (ptr != NULL && pthread_setspecific(key, ptr) != 0)
		{
			free(ptr);
			ptr = NULL;
		}
	}
	return ptr ;
}

#define errno_test *_errno()
//int errno_test = 0;

/*
 * Demo driver: sets the main thread's errno_test to 100, starts TASK_NUM
 * worker threads and then prints its own errno_test once per second
 * forever.
 *
 * Each worker gets a pointer to its own entry of thread_ids. The loop
 * counter `i` used to be passed directly, so workers read whatever value it
 * had by the time they ran. Ids are numbered from 1, which is what the
 * sample output of this demo shows.
 *
 * Returns -1 if a thread cannot be created; otherwise never returns.
 */
int main(void){
	/* static: must stay valid and unchanged for the workers' lifetime */
	static int thread_ids[TASK_NUM];
	int tmp = 0,i = 0;

	errno_test = 100;
	for(i = 0;i < TASK_NUM; i++){
		thread_ids[i] = i + 1;
		tmp = pthread_create(&global_thread_no[i],NULL,thread_deal_func,&thread_ids[i]);
		if(tmp != 0){
			/* pthread_create reports failure via its return value */
			printf("can't create thread: %s\n",strerror(tmp));
			return -1;
		}
	}

	while(1){
		printf("man thread ,errno_test:%d\n",errno_test);
		sleep(1);
	}
	return 0;
}

/*
 * Worker thread: once per second increments its own errno_test and prints
 * it together with the number read from *arg at start-up. Never returns.
 */
void *thread_deal_func(void *arg)
{
	const int number = *(int *)arg;

	for (;;) {
		/* errno_test expands to *_errno(): this thread's private int */
		errno_test += 1;
		printf("thread number:%d,errno_test:%d\n",number,errno_test);
		sleep(1);
	}
}
  • 使用 TLS:
1
2
3
4
5
6
7
8
9
10
11
12
13
exp ./test
man thread ,errno_test:100
thread number:1,errno_test:1
thread number:2,errno_test:1
man thread ,errno_test:100
thread number:2,errno_test:2
thread number:1,errno_test:2
thread number:2,errno_test:3
thread number:1,errno_test:3
man thread ,errno_test:100
thread number:2,errno_test:4
man thread ,errno_test:100
thread number:1,errno_test:4
  • 使用全局变量:
1
2
3
4
5
6
7
8
9
10
exp ./test                         
thread number:1,errno_test:101
man thread ,errno_test:101
thread number:2,errno_test:102
thread number:2,errno_test:103
man thread ,errno_test:103
thread number:1,errno_test:104
thread number:2,errno_test:105
thread number:1,errno_test:106
man thread ,errno_test:106
  • 这里的 errno_test 看起来像是全局变量,其实是一个函数的返回值
1
#define	errno_test *_errno()
  • 每次使用 errno_test 时,都会调用 _errno() 函数
  • 在其中会申请与当前线程绑定的变量,并且调用 malloc 为其分配空间:
    • 对于各个线程来说,这些变量的地址不同了
    • 对于用户来说,各个线程拿到不同的变量,不会相互干扰

TLS实现方式:关键字

在 Linux 中还有一种更为高效的线程局部存储方法,就是使用关键字 __thread 来定义变量

  • 凡是带有 __thread 的变量,每个线程都拥有该变量的一份拷贝,且互不干扰
  • 线程局部存储中的变量将一直存在,直至线程终止,当线程终止时会自动释放这一存储

测试案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include<stdio.h>
#include<stdlib.h>
#include <pthread.h>

__thread int var = 0;

/*
 * Thread body of the __thread demo.
 *
 * arg carries a small integer smuggled through the pointer value (not a
 * real address). The thread prints the address of its own copy of `var`
 * and then adds that integer to it five times, one step per second.
 *
 * Returns NULL. The function is declared to return void *, so it must not
 * fall off its end.
 *
 * NOTE(review): sleep() is declared in <unistd.h>, which this listing does
 * not include.
 */
void* task_entry(void* arg){
	/* go through size_t so the pointer-to-int narrowing is explicit */
	int idx = (int)(size_t)arg;
	int i;

	printf("addr : %p\n",&var);
	for (i = 0; i < 5; ++i) {
		printf("thread:%d var = %d\n", idx, var += idx);
		sleep(1);
	}
	return NULL;
}

/*
 * Start two task_entry() threads, tagged 1 and 2, and wait for both to
 * finish.
 */
int main(){
	pthread_t workers[2];
	int k;

	pthread_create(&workers[0],NULL,task_entry,(void *)1);
	pthread_create(&workers[1],NULL,task_entry,(void *)2);

	for (k = 0; k < 2; k++)
		pthread_join(workers[k],NULL);

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
exp ./test                         
addr : 0x7f20b3ebe6fc
thread:1 var = 1
addr : 0x7f20b36bd6fc
thread:2 var = 2
thread:1 var = 2
thread:2 var = 4
thread:2 var = 6
thread:1 var = 3
thread:1 var = 4
thread:2 var = 8
thread:2 var = 10
thread:1 var = 5
  • 两个线程并不会干扰,而且地址也不同

现在我们就分析一下 __thread 底层的逻辑,测试案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include<stdio.h>
#include<stdlib.h>
#include <pthread.h>

__thread size_t name1 = 0x111111;
__thread size_t name2 = 0x222222;
size_t name3 = 0x333333;
size_t name4 = 0x444444;
__thread size_t name5 = 0x555555;
__thread size_t name6 = 0x666666;

/*
 * First thread of the TLS layout experiment: prints the VALUE of its copy
 * of name1 (the "addr:" label notwithstanding), overwrites that copy with
 * 0x777777 and then spins forever - presumably to keep the thread alive
 * for inspection in the debugger.
 *
 * NOTE(review): name1 is a size_t printed with %lx; %zx is the matching
 * conversion.
 */
void* task_entry1(){
printf("addr:0x%lx\n",name1);
/* only this thread's copy changes */
name1 = 0x777777;
while(1){}
}

/*
 * Second thread of the TLS layout experiment: prints the VALUE of its copy
 * of name2, then writes 0x888888 into its copy of name1 (note: name1, not
 * name2) and spins forever.
 *
 * NOTE(review): name2 is a size_t printed with %lx; %zx is the matching
 * conversion.
 */
void* task_entry2(){
printf("addr:0x%lx\n",name2);
/* writes name1, so the first TLS slot of this thread becomes 0x888888 */
name1 = 0x888888;
while(1){}
}

/*
 * Starts the two experiment threads and joins them. Both thread bodies end
 * in an endless loop, so the joins never complete.
 */
int main(){
pthread_t pid1,pid2;
pthread_create(&pid1,NULL,task_entry1,NULL);
pthread_create(&pid2,NULL,task_entry2,NULL);
pthread_join(pid1,NULL);
pthread_join(pid2,NULL);
return 0;
}

打开 GDB 进行调试:

  • 在 GDB 中输入 tls 可以显示 TLS 的位置(在 ubuntu 中 TLS 存储在 FS 寄存器中)
1
2
pwndbg> tls 
tls : 0x7ffff7d99740
  • 而之前添加了 __thread 关键字的变量则存储在 TLS 的上方
1
2
3
4
5
6
7
8
9
pwndbg> telescope 0x7ffff7d99740-0x20
00:00000x7ffff7d99720 ◂— 0x111111
01:00080x7ffff7d99728 ◂— 0x222222 /* '"""' */
02:00100x7ffff7d99730 ◂— 0x555555 /* 'UUU' */
03:00180x7ffff7d99738 ◂— 0x666666 /* 'fff' */
04:00200x7ffff7d99740 ◂— 0x7ffff7d99740
05:00280x7ffff7d99748 —▸ 0x7ffff7d9a0a0 ◂— 0x1
06:00300x7ffff7d99750 —▸ 0x7ffff7d99740 ◂— 0x7ffff7d99740
07:00380x7ffff7d99758 ◂— 0x1
  • 函数 pthread_create 会在 heap 上申请一片区域:
1
2
3
4
5
6
7
Allocated chunk | PREV_INUSE
Addr: 0x555555559290
Size: 0x131

Allocated chunk | PREV_INUSE
Addr: 0x5555555593c0
Size: 0x131
  • 查看里面记录的信息:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
pwndbg> telescope 0x555555559290
00:00000x555555559290 ◂— 0x0
01:00080x555555559298 ◂— 0x131
02:00100x5555555592a0 ◂— 0x10
03:00180x5555555592a8 ◂— 0x0
04:00200x5555555592b0 ◂— 0x1
05:00280x5555555592b8 ◂— 0x0
06:00300x5555555592c0 —▸ 0x7ffff7d986e0 ◂— 0x777777 /* 'www' */
07:00380x5555555592c8 ◂— 0x0
08:00400x5555555592d0 —▸ 0x7ffff7d98650 —▸ 0x7ffff7f894a0 (_nl_global_locale) —▸ 0x7ffff7f856c0 (_nl_C_LC_CTYPE) —▸ 0x7ffff7f51fd9 (_nl_C_name) ◂— ...
09:00480x5555555592d8 ◂— 0x0
pwndbg> telescope 0x7ffff7d986e0
00:00000x7ffff7d986e0 ◂— 0x777777 /* 'www' */
01:00080x7ffff7d986e8 ◂— 0x222222 /* '"""' */
02:00100x7ffff7d986f0 ◂— 0x555555 /* 'UUU' */
03:00180x7ffff7d986f8 ◂— 0x666666 /* 'fff' */
04:0020│ r15 0x7ffff7d98700 ◂— 0x7ffff7d98700
05:00280x7ffff7d98708 —▸ 0x5555555592b0 ◂— 0x1
06:00300x7ffff7d98710 —▸ 0x7ffff7d98700 ◂— 0x7ffff7d98700
07:00380x7ffff7d98718 ◂— 0x1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
pwndbg> telescope 0x5555555593c0
00:00000x5555555593c0 ◂— 0x0
01:00080x5555555593c8 ◂— 0x131
02:00100x5555555593d0 ◂— 0x10
03:00180x5555555593d8 ◂— 0x0
04:00200x5555555593e0 ◂— 0x1
05:00280x5555555593e8 ◂— 0x0
06:00300x5555555593f0 —▸ 0x7ffff75976e0 ◂— 0x888888
07:00380x5555555593f8 ◂— 0x0
08:00400x555555559400 —▸ 0x7ffff7597650 —▸ 0x7ffff7f894a0 (_nl_global_locale) —▸ 0x7ffff7f856c0 (_nl_C_LC_CTYPE) —▸ 0x7ffff7f51fd9 (_nl_C_name) ◂— ...
09:00480x555555559408 ◂— 0x0
pwndbg> telescope 0x7ffff75976e0
00:00000x7ffff75976e0 ◂— 0x888888
01:00080x7ffff75976e8 ◂— 0x222222 /* '"""' */
02:00100x7ffff75976f0 ◂— 0x555555 /* 'UUU' */
03:00180x7ffff75976f8 ◂— 0x666666 /* 'fff' */
04:0020│ r9 0x7ffff7597700 ◂— 0x7ffff7597700
05:00280x7ffff7597708 —▸ 0x5555555593e0 ◂— 0x1
06:00300x7ffff7597710 —▸ 0x7ffff7597700 ◂— 0x7ffff7597700
07:00380x7ffff7597718 ◂— 0x1
  • 其实看到这里就很清晰了
  • 函数 pthread_create 在调用时会把主线程 TLS 上方存储 __thread 变量的内存空间给复制一份,添加到自己的 TLS 上方
  • 然后申请一片 heap 空间来记录数据
  • 如果 pthread_create 生成的线程想要写入带有 __thread 关键字的变量,就会操作自己 TLS 上方所记录的变量,而不会影响到其他线程

Character device drivers

实验室目标:

  • 了解字符设备驱动程序背后的概念
  • 了解可以在字符设备上执行的各种操作
  • 使用等待队列

设备驱动程序是与硬件设备交互的内核组件(通常是模块)

在UNIX中有两类设备文件:

  • 第一类设备,字符设备:(例如:键盘、鼠标、串行端口、声卡、操纵杆)
    • 慢速设备
    • 管理少量数据
    • 访问数据不需要频繁的查找查询
    • 通常,这些设备的 Read/Write 是按字节顺序执行
  • 第二类设备,块设备:(例如:硬盘驱动器、光盘、RAM 磁盘、磁带驱动器)
    • 数据量大
    • 数据按块组织
    • 搜索频繁
    • 对于这些设备,Read/Write 是在数据块级别完成的

因此,UNIX 提供了两种设备驱动程序:

  • 字符驱动 - character driven
  • 块驱动 - block driven
  • PS:Linux 中还有一种网络设备驱动 - network driver(不是本篇文章的重点)

对于这两种类型的设备驱动程序,Linux 内核提供了不同的 API,其中大多数参数都有直接含义:

  • file 和 inode:标识设备类型文件
  • size:要读取或写入的字节数
  • offset:要读取或写入的位移(将相应更新)
  • user_buffer:从中 Read/Write 的用户缓冲区
  • whence:搜索方式(搜索操作开始的位置)
  • cmd 和 arg:用户发送到 ioctl 调用的参数(IO控制)

Majors and Minors

Linux 中,设备具有与之关联的唯一固定标识符,由两部分组成:major and minor

  • major:标识设备的类型(IDE 磁盘、SCSI 磁盘、串行端口等)
  • minor:标识具体的设备(第一个磁盘、第二个串行端口等)

PS:因为物理设备已经被驱动抽象为“在 Linux 上运行的软件”,所以 Linux 可以通过这种方式定位具体的物理设备

Inode and File

从文件系统的角度来看,inode 表示文件:

  • inode 的属性是与文件关联的大小,权限,时间
  • inode 唯一标识文件系统中的文件

从用户的角度来看,file 表示文件:

  • file 的属性是 inode,文件名,文件打开属性,文件位置
  • 所有打开的文件都有与之关联的 file 结构体

回到设备驱动程序,有两个实体几乎总是具有标准的使用方式:

  • inode:更用于确定执行操作的设备的 major and minor
  • file:用于确定打开文件的标志,还用于保存和访问(以后)私有数据

Registration and unregistration of character devices

设备的注册/注销是通过指定 major and minor 设备来实现的

  • 类型 dev_t 用于保留设备的标识符(major and minor),并且可以使用 MKDEV 宏获取

对于设备标识符的静态分配和静态注销:

1
2
3
4
#include <linux/fs.h>

int register_chrdev_region(dev_t first, unsigned int count, char *name); /* 创建一个字符设备区 */
void unregister_chrdev_region(dev_t first, unsigned int count); /* 删除一个字符设备区 */

分配标识符后,必须初始化字符设备并且必须通知内核,然后才能注册/删除字符设备:

1
2
3
4
5
#include <linux/cdev.h>

void cdev_init(struct cdev *cdev, struct file_operations *fops); /* 初始化字符设备,并通知内核 */
int cdev_add(struct cdev *dev, dev_t num, unsigned int count); /* 注册字符设备 */
void cdev_del(struct cdev *dev); /* 删除字符设备 */

使用案例如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <linux/fs.h>
#include <linux/cdev.h>

#define MY_MAJOR 42
#define MY_MAX_MINORS 5

/*
 * Per-device state of the example driver. The cdev is embedded in the
 * structure; driver-specific fields follow it.
 */
struct my_device_data {
struct cdev cdev;
/* my data starts here */
//...
};

struct my_device_data devs[MY_MAX_MINORS];

/*
 * Operations table attached to every cdev of the example driver by
 * cdev_init() in init_module().
 */
const struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = my_open,
.read = my_read,
.write = my_write,
.release = my_release,
.unlocked_ioctl = my_ioctl
};

/*
 * Module init of the example driver: reserves MY_MAX_MINORS device numbers
 * under MY_MAJOR and registers one character device per minor.
 *
 * cdev_add() can fail. Its result is now checked, and on failure the
 * devices added so far are removed and the device-number range is given
 * back, so a failed load leaves nothing registered.
 *
 * Returns 0 on success or the negative error of the failing call.
 */
int init_module(void)
{
	int i, err;

	err = register_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS,
				     "my_device_driver"); /* MKDEV builds the device identifier */
	if (err != 0) {
		return err;
	}

	for(i = 0; i < MY_MAX_MINORS; i++) {
		cdev_init(&devs[i].cdev, &my_fops); /* bind our own file_operations */
		err = cdev_add(&devs[i].cdev, MKDEV(MY_MAJOR, i), 1);
		if (err != 0)
			goto undo;
	}

	return 0;

undo:
	/* devs[i] itself was not added; remove devs[i-1] .. devs[0] */
	while (--i >= 0)
		cdev_del(&devs[i].cdev);
	unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS);
	return err;
}

/*
 * Module exit of the example driver: removes every character device that
 * init_module() added and then releases the device-number range.
 */
void cleanup_module(void)
{
int i;
for(i = 0; i < MY_MAX_MINORS; i++) {
/* release devs[i] fields */
cdev_del(&devs[i].cdev);
}
unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS);
}

同一个 dev_t 可以注册多个字符设备,每次 open(DEVICE_PATH, O_RDONLY) 时,本质上是和一个具体的字符设备进行交互(使用 struct cdev 父类结构体中的数据)

因此,如果两个进程访问同一个字符设备,就很可能在临界区引发安全问题,所以我们要在字符设备的 open 上加锁,禁止其被二次打开

为了程序的并发性,通常我们需要为同一个 dev_t 注册多个字符设备,在进程 open 提供不同的字符设备供其使用

Access to the address space of the process

设备的驱动程序是应用程序和硬件之间的接口,因此,我们经常必须访问用户空间数据(但不能以取消引用用户空间指针的方式,来直接访问用户空间)

直接访问用户空间指针可能会导致:

  • 不正确的行为(根据体系结构的不同,用户空间指针可能无效或映射到内核空间)
  • 内核 oops(用户模式指针可以引用非驻留内存区域)
  • 安全问题

因此通过调用下面的宏函数来正确访问用户空间数据:

1
2
3
4
5
6
#include <asm/uaccess.h>

put_user(type val, type *address);
get_user(type val, type *address);
unsigned long copy_to_user(void __user *to, const void *from, unsigned long n);
unsigned long copy_from_user(void *to, const void __user *from, unsigned long n);

下图说明了 Read 操作以及如何在用户空间和驱动程序之间传输数据:

  • 当驱动 driver 有足够多的可用数据时,它将准确地将所需 size 的数据传输给用户
  • 当驱动 driver 没有足够多的可用数据时,它将把所有的可用数据传输给用户

Read 操作的案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Example read() handler: copies up to size bytes of the device buffer,
 * starting at *offset, into user_buffer.
 *
 * Returns the number of bytes copied and advances *offset by that amount,
 * returns 0 when nothing is left to read, or -EFAULT when copy_to_user()
 * cannot copy everything.
 *
 * NOTE(review): the function is declared `static int` while the read
 * handlers elsewhere in this document return ssize_t - confirm which
 * signature is intended.
 */
static int my_read(struct file *file, char __user *user_buffer,
size_t size, loff_t *offset)
{
struct my_device_data *my_data = (struct my_device_data *) file->private_data;
/* never read past the end of the data the device holds */
ssize_t len = min(my_data->size - *offset, size);

/* at or past the end: report end-of-file */
if (len <= 0)
return 0;

/* read data from my_data->buffer to user buffer */
if (copy_to_user(user_buffer, my_data->buffer + *offset, len))
return -EFAULT;

*offset += len;
return len;
}

下图说明了 Write 操作以及如何在用户空间和驱动程序之间传输数据:

  • 写入操作将响应来自用户空间的写入请求,其范围不会大于最大驱动程序容量 MAXSIZ

Write 操作的案例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Example write() handler: copies up to size bytes from user_buffer into
 * the device buffer, starting at *offset.
 *
 * Returns the number of bytes stored and advances *offset by that amount,
 * returns 0 when no room is left, or -EFAULT when copy_from_user() cannot
 * copy everything.
 *
 * NOTE(review): the function is declared `static int` while the write
 * handlers elsewhere in this document return ssize_t - confirm which
 * signature is intended.
 */
static int my_write(struct file *file, const char __user *user_buffer,
size_t size, loff_t * offset)
{
struct my_device_data *my_data = (struct my_device_data *) file->private_data;
/* never write past the capacity recorded in my_data->size */
ssize_t len = min(my_data->size - *offset, size);

/* at or past the end: nothing can be stored */
if (len <= 0)
return 0;

/* read data from user buffer to my_data->buffer */
if (copy_from_user(my_data->buffer + *offset, user_buffer, len))
return -EFAULT;

*offset += len;
return len;
}

Ioctl

除了 Read 和 Write 操作之外,驱动程序还需要能够执行某些物理设备控制任务(这些操作是通过实现函数来完成的)

可用通过如下函数完成此操作:

1
static long my_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  • file:打开的设备文件描述符
  • cmd:从用户空间发送的命令
  • arg:指向用户空间的指针,使用 copy_from_user 来安全地获取其值

在实现该功能之前,必须选择与命令对应的数字(建议使用宏定义 _IOC(dir, type, nr, size) 来完成此操作),然后在一个 Switch-Case 中完成各个命令

使用案例如下:(在用户空间调用 ioctl,对应到内核就是 my_ioctl)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <asm/ioctl.h>

#define MY_IOCTL_IN _IOC(_IOC_WRITE, 'k', 1, sizeof(my_ioctl_data))

/*
 * Example ioctl handler.
 *
 * Only MY_IOCTL_IN is understood: its argument is a user-space pointer to a
 * my_ioctl_data, which is copied into a local before being processed.
 *
 * Returns 0 on success, -EFAULT when the argument cannot be copied from
 * user space, and -ENOTTY for any other command.
 */
static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
{
	struct my_device_data *my_data = file->private_data;
	my_ioctl_data mid;

	/* reject unknown commands up front */
	if (cmd != MY_IOCTL_IN)
		return -ENOTTY;

	if (copy_from_user(&mid, (my_ioctl_data *) arg, sizeof(mid)))
		return -EFAULT;

	/* process data and execute command */

	return 0;
}

Waiting queues

等待队列是正在等待特定事件的进程的列表,使用 wait_queue_head_t 类型定义,可由函数/宏使用:

1
2
3
4
5
6
7
8
9
10
#include <linux/wait.h>

DECLARE_WAIT_QUEUE_HEAD(wq_name); /* 在编译时初始化队列 */
void init_waitqueue_head(wait_queue_head_t *q); /* 初始化队列 */
int wait_event(wait_queue_head_t q, int condition); /* 在条件为false时将当前线程添加到队列中,将其设置为TASK_UNINTERRUPTIBLE,并调度新线程 */
int wait_event_interruptible(wait_queue_head_t q, int condition); /* 在条件为false时将当前线程添加到队列中,将其设置为TASK_INTERRUPTIBLE,并调度新线程 */
int wait_event_timeout(wait_queue_head_t q, int condition, int timeout); /* 和wait_event一样,只是timeout耗尽时同样会退出 */
int wait_event_interruptible_timeout(wait_queue_head_t q, int condition, int timeout); /* 和wait_event_interruptible一样,只是timeout耗尽时同样会退出 */
void wake_up(wait_queue_head_t *q); /* 从目标等待队列中唤醒一个进程 */
void wake_up_interruptible(wait_queue_head_t *q); /* 仅唤醒状态为TASK_INTERRUPTIBLE的线程 */

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=device_drivers make skels
make build

直接看最终代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
* Character device drivers lab
*
* All tasks
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/wait.h>

#include "../include/so2_cdev.h"

MODULE_DESCRIPTION("SO2 character device");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

#define LOG_LEVEL KERN_INFO

#define MY_MAJOR 42
#define MY_MINOR 0
#define NUM_MINORS 1
#define MODULE_NAME "so2_cdev"
#define MESSAGE "hello\n"
#define IOCTL_MESSAGE "Hello ioctl"

#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

/*
 * Per-device state of the so2_cdev driver; one instance per minor lives in
 * devs[].
 */
struct so2_device_data {
/* TODO 2: add cdev member */
/* embedded so container_of() can recover this struct from inode->i_cdev */
struct cdev cdev;
/* TODO 4: add buffer with BUFSIZ elements */
/* backing store served by the read and write handlers */
char buffer[BUFSIZ];
/* TODO 7: extra members for home */
wait_queue_head_t queue;
/* TODO 3: add atomic_t access variable to keep track if file is opened */
/* 0 = device free, 1 = device currently open */
atomic_t access;
};

struct so2_device_data devs[NUM_MINORS];

static int so2_cdev_open(struct inode *inode, struct file *file)
{
struct so2_device_data *data;

/* TODO 2: print message when the device file is open. */
printk("message:%s",MESSAGE);
/* TODO 3: inode->i_cdev contains our cdev struct, use container_of to obtain a pointer to so2_device_data */
data = container_of(inode->i_cdev, struct so2_device_data, cdev);
file->private_data = data;
/* TODO 3: return immediately if access is = 0, use atomic_cmpxchg */
/* 这里我的想法和原实验不一样:
设置已经open的字符设备的"data->access"为'1',
设置没有open的字符设备的"data->access"为'0',
初始化时,所有的"data->access"都为'0',表明所有字符设备都没有open过
如果一个进程尝试open一个"data->access"为'1'的'/dev/so2_cdev'字符设备,就不会立刻返回,而是执行后面的语句并睡眠
(so2_device_data也属于临界区,如果两个进程open同一个字符设备就可能有安全问题)
*/
if(!atomic_cmpxchg(&data->access,0,1)){
return 0;
}

set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(10 * HZ);
return 0;
}

/*
 * Release handler: logs the close and, when EXTRA is not defined, marks the
 * device as free again by resetting the access flag to 0.
 *
 * Return: always 0.
 */
static int
so2_cdev_release(struct inode *inode, struct file *file)
{
#ifndef EXTRA
	struct so2_device_data *dev = file->private_data;
#endif

	printk("message:%s", MESSAGE);

#ifndef EXTRA
	/* The device can be opened again from now on. */
	atomic_set(&dev->access, 0);
#endif

	return 0;
}

/*
 * Read handler: copies bytes from the device buffer to user space.
 *
 * @file:        open file; private_data holds the so2_device_data
 * @user_buffer: destination buffer in user space
 * @size:        number of bytes requested
 * @offset:      in/out position inside the device buffer
 *
 * The transfer length is clamped to the bytes remaining in the BUFSIZ-sized
 * buffer. A position at or past the end yields 0 (end of file); previously
 * such a position produced a negative length that was converted to a huge
 * size_t.
 *
 * Return: number of bytes copied, 0 at end of buffer, -EINVAL for a negative
 * position, -EFAULT if the user buffer is not writable.
 */
static ssize_t
so2_cdev_read(struct file *file,
	      char __user *user_buffer,
	      size_t size, loff_t *offset)
{
	struct so2_device_data *data =
		(struct so2_device_data *) file->private_data;
	size_t to_read;

#ifdef EXTRA
	/* TODO 7: extra tasks for home */
#endif

	if (*offset < 0)
		return -EINVAL;
	if (*offset >= BUFSIZ)
		return 0;

	/* BUFSIZ - *offset is in (0, BUFSIZ] here, so the size_t cast is safe. */
	to_read = min_t(size_t, BUFSIZ - *offset, size);
	if (copy_to_user(user_buffer, data->buffer + *offset, to_read))
		return -EFAULT;

	*offset += to_read;
	return to_read;
}

/*
 * Write handler: copies bytes from user space into the device buffer.
 *
 * @file:        open file; private_data holds the so2_device_data
 * @user_buffer: source buffer in user space
 * @size:        number of bytes offered by the caller
 * @offset:      in/out position inside the device buffer
 *
 * The transfer length is clamped to the room left in the BUFSIZ-sized buffer
 * and the number of bytes actually stored is returned. The previous version
 * returned @size even when the data had been truncated.
 *
 * Return: number of bytes stored, -EINVAL for a negative position, -ENOSPC
 * when the buffer is already full, -EFAULT if the user buffer is not readable.
 */
static ssize_t
so2_cdev_write(struct file *file,
	       const char __user *user_buffer,
	       size_t size, loff_t *offset)
{
	struct so2_device_data *data =
		(struct so2_device_data *) file->private_data;
	size_t to_write;

	if (*offset < 0)
		return -EINVAL;
	if (size == 0)
		return 0;
	/* Returning 0 for a non-empty request would make callers retry forever. */
	if (*offset >= BUFSIZ)
		return -ENOSPC;

	to_write = min_t(size_t, BUFSIZ - *offset, size);
	if (copy_from_user(data->buffer + *offset, user_buffer, to_write))
		return -EFAULT;

	*offset += to_write;
	/* TODO 7: extra tasks for home */

	return to_write;
}

static long
so2_cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct so2_device_data *data =
(struct so2_device_data *) file->private_data;
int ret = 0;
int remains;
printk("ioctl start");
switch (cmd) {
/* TODO 6: if cmd = MY_IOCTL_PRINT, display IOCTL_MESSAGE */
case MY_IOCTL_PRINT:
printk("%s",IOCTL_MESSAGE);
break;
/* TODO 7: extra tasks, for home */
case MY_IOCTL_SET_BUFFER:
if(copy_from_user(data->buffer,(char *)arg,strlen(arg)))
return -EFAULT;
printk("buffer from usr:%s",data->buffer);
break;
case MY_IOCTL_GET_BUFFER:
if(copy_to_user((char *)arg,data->buffer,strlen(data->buffer)))
return -EFAULT;
break;
case MY_IOCTL_DOWN:
wait_event(data->queue,0); /* 这里有小迷,当前进程直接进入等待队列,程序卡死 */
break;
case MY_IOCTL_UP:
wake_up(&data->queue); /* 这里就更迷了,根本不能确定唤醒了那个进程 */
break;
default:
ret = -EINVAL;
}

return ret;
}

/* File operations table registered for every so2_cdev minor. */
static const struct file_operations so2_fops = {
	.owner          = THIS_MODULE,
	/* data path */
	.read           = so2_cdev_read,
	.write          = so2_cdev_write,
	/* control path */
	.unlocked_ioctl = so2_cdev_ioctl,
	/* lifetime */
	.open           = so2_cdev_open,
	.release        = so2_cdev_release,
};

static int so2_cdev_init(void)
{
int err;
int i;

/* TODO 1: register char device region for MY_MAJOR and NUM_MINORS starting at MY_MINOR */
err = register_chrdev_region(MKDEV(MY_MAJOR, 0),NUM_MINORS,MODULE_NAME);


for (i = 0; i < NUM_MINORS; i++) {
#ifdef EXTRA
/* TODO 7: extra tasks, for home */
#else
/* TODO 4: initialize buffer with MESSAGE string */
strcpy(devs[i].buffer,MESSAGE);
#endif
/* TODO 7: extra tasks for home */
init_waitqueue_head(&devs[i].queue);
/* TODO 3: set access variable to 0, use atomic_set */
atomic_set(&devs[i].access,0);
/* TODO 2: init and add cdev to kernel core */
cdev_init(&devs[i].cdev,&so2_fops);
cdev_add(&devs[i].cdev,MKDEV(MY_MAJOR,0),NUM_MINORS);
}

return 0;
}

static void so2_cdev_exit(void)
{
int i;

for (i = 0; i < NUM_MINORS; i++) {
/* TODO 2: delete cdev from kernel core */
cdev_del(&devs[i].cdev);
}

/* TODO 1: unregister char device region, for MY_MAJOR and NUM_MINORS starting at MY_MINOR */
unregister_chrdev_region(MKDEV(MY_MAJOR,0),NUM_MINORS);
}

module_init(so2_cdev_init);
module_exit(so2_cdev_exit);
  • 在用户态执行的测试代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/*
* SO2 Lab - Linux device drivers (#4)
* User-space test file
*/

#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include "../include/so2_cdev.h"

#define DEVICE_PATH "/dev/so2_cdev"

/*
* prints error message and exits
*/

/*
 * Print @message followed by the description of the current errno value
 * (via perror) and terminate the program with EXIT_FAILURE.
 */
static void error(const char *message)
{
perror(message);
exit(EXIT_FAILURE);
}

/*
* print use case
*/

/*
 * Print the command-line help on stdout and exit with EXIT_FAILURE.
 * @argv0: program name shown in the usage line.
 */
static void usage(const char *argv0)
{
	static const char options[] =
		"\tp - print\n"
		"\ts string - set buffer\n"
		"\tg - get buffer\n"
		"\td - down\n"
		"\tu - up\n"
		"\tn - open with O_NONBLOCK and read data\n";

	printf("Usage: %s <options>\n options:\n", argv0);
	fputs(options, stdout);
	exit(EXIT_FAILURE);
}

/*
* Sample run:
* ./so2_cdev_test p ; print ioctl message
* ./so2_cdev_test d ; wait on wait_queue
* ./so2_cdev_test u ; wake up the process waiting on wait_queue
*/

int main(int argc, char **argv)
{
int fd;
char buffer[BUFFER_SIZE];

if (argc < 2)
usage(argv[0]);

if (strlen(argv[1]) != 1)
usage(argv[0]);

fd = open(DEVICE_PATH, O_RDONLY);
if (fd < 0) {
perror("open");
exit(EXIT_FAILURE);
}

switch (argv[1][0]) {
case 'p': /* print */
if (ioctl(fd, MY_IOCTL_PRINT, 0) < 0) {
perror("ioctl");
exit(EXIT_FAILURE);
}

break;
case 's': /* set buffer */
if (argc < 3)
usage(argv[0]);
memset(buffer, 0, BUFFER_SIZE);
strncpy(buffer, argv[2], BUFFER_SIZE);
if (ioctl(fd, MY_IOCTL_SET_BUFFER, buffer) < 0) {
perror("ioctl");
exit(EXIT_FAILURE);
}
break;
case 'g': /* get buffer */
if (ioctl(fd, MY_IOCTL_GET_BUFFER, buffer) < 0) {
perror("ioctl");
exit(EXIT_FAILURE);
}
buffer[BUFFER_SIZE-1] = 0;
printf("IOCTL buffer contains %s\n", buffer);
break;
case 'd': /* down */
if (ioctl(fd, MY_IOCTL_DOWN, 0) < 0) {
perror("ioctl");
exit(EXIT_FAILURE);
}
break;
case 'u': /* up */
if (ioctl(fd, MY_IOCTL_UP, 0) < 0) {
perror("ioctl");
exit(EXIT_FAILURE);
}
break;
case 'n':
if (fcntl(fd, F_SETFL, O_RDONLY | O_NONBLOCK) < 0) {
perror("fcntl");
exit(EXIT_FAILURE);
}

if (read(fd, buffer, BUFFER_SIZE) < 0) {
perror("read");
exit(EXIT_FAILURE);
}
buffer[BUFFER_SIZE-1] = 0;
printf("Device buffer contains %s\n", buffer);
break;
default:
error("Wrong parameter");
}

close(fd);

return 0;
}
  • 结果:
1
2
root@qemux86:~/skels/device_drivers/kernel# insmod so2_cdev.ko
root@qemux86:~/skels/device_drivers# mknod /dev/so2_cdev c 42 0
1
2
3
4
5
root@qemux86:~/skels/device_drivers# ./user/so2_cdev_test p                     
message:hello
ioctl start
Hello ioctl
message:hello
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
root@qemux86:~/skels/device_drivers# ./user/so2_cdev_test g                     
message:hello
IOCTL buffer contains hello

ioctl start
message:hello
root@qemux86:~/skels/device_drivers# ./user/so2_cdev_test s yhellow
message:hello
ioctl start
buffer from usr:yhellow
message:hello
root@qemux86:~/skels/device_drivers# ./user/so2_cdev_test g
message:hello
IOCTL buffer contains yhellow
ioctl start
message:hello
1
2
3
4
root@qemux86:~/skels/device_drivers# cat /dev/so2_cdev                          
message:hello
hello
message:hello

感觉在锁和等待队列这一块还不是很熟悉,还需要多写代码

Kernel API

  • 熟悉基本的 Linux kernel API
  • 内存分配机制说明
  • locking 机制说明

内核是一个独立的实体,不能使用用户空间中的库(甚至不能使用 libc)

总之,内核编程基于一个全新的独立API,无论我们指的是 POSIX 还是 ANSI-C,它都与用户空间 API 无关

Accessing memory

内核编程中的一个重要区别是如何访问和分配内存,由于内核编程非常接近物理机,因此内存管理有重要的规则

首先,它适用于几种类型的内存:

  • 物理内存
  • 内核地址空间中的虚拟内存
  • 进程地址空间的虚拟内存
  • 驻留内存(那些被映射到进程虚拟内存空间的物理内存)

对于驻留内存而言,进程虚拟地址和内核虚拟地址有不同的情况:

  • 进程的地址空间中的虚拟内存不能被视为驻留:
    • page 可能被交换,或者由于需求分页机制而根本不存在于物理内存中
  • 内核地址空间中的内存可以驻留或不驻留:
    • 模块的数据段和代码段以及进程的内核堆栈都是驻留的
    • 动态内存可能是驻留的,也可能不是驻留的,具体取决于它的分配方式
  • 使用驻留内存时,事情很简单,因为可以随时访问驻留内存
  • 如果使用非驻留内存,则只能从某些上下文中访问它
  • 因此,当操作系统检测到此类访问时,它将采取严厉措施(阻止或重置系统以防止严重损坏)

进程的虚拟内存通常不能直接从内核访问,但在某些情况下,设备驱动程序需要执行此操作:

  • 典型情况是设备驱动程序需要从用户空间访问缓冲区
  • 在这种情况下,设备驱动程序必须使用特殊功能,并且不能直接访问缓冲区
  • 这是防止访问无效内存区域所必需的

与用户空间编程相比,内核的另一个区别是栈:

  • 栈的大小是固定且有限的(在 Linux 中使用 4K 堆栈,在 Windows 中使用 12K 堆栈)
  • 因此,应避免在堆栈上分配大型结构或使用递归调用

Contexts of execution

关于内核执行,我们区分两个上下文:

  • 进程上下文:
    • 运行代码作为系统调用的结果时
    • 在内核线程的上下文中运行时
  • 中断上下文:
    • 在例程中运行以处理中断时
    • 可延迟操作时

某些内核 API 调用可能会阻止当前进程(例如使用信号量或等待条件变量),在这种情况下,进程将进入等待队列,并且让另一个进程运行

Locking

内核编程最重要的特性之一是并行性,Linux 支持具有多个处理器和内核抢占性的 SMP 系统

这使得内核编程更加困难,因为对全局变量的访问必须与自旋锁基元 spinlock primitives 或阻塞基元 blocking primitives 同步:

  • 在受自旋锁保护的关键区域中运行的代码,不允许挂起当前进程
  • 当进程开始自旋时,其占用的 CPU 资源也不会释放
  • 因此,尽可能少使用自旋锁

Preemptivity

Linux 使用抢占式内核

抢占式多任务处理的概念是指:操作系统在其量程(时间片)到期时,强制中断在用户空间中运行的进程,以便运行另一个进程

  • 由于抢占性,当我们需要从不同进程上下文中运行的两部分代码之间共享资源时,我们需要使用 synchronization primitives 同步基元来保护自己,即使在单个处理器的情况下也是如此

Convention indicating errors

对于 Linux 内核编程,用于调用函数以指示成功的约定与在 UNIX 编程中相同:

  • 0 表示成功,或 0 以外的值表示失败(返回负值)

详尽的错误列表和摘要解释可以在 include/asm-generic/errno-base.h 和 include/asm-generic/errno.h 中找到

Exercises

要解决练习,您需要执行以下步骤:

  • 从模板准备 skeletons
  • 构建模块
  • 将模块复制到虚拟机
  • 启动 VM 并在 VM 中测试模块
1
2
3
make clean
LABS=kernel_api make skels
make build
1
2
➜  kernel_api git:(master) ✗ ls
1-mem 2-sched-spin 3-memory 4-list 5-list-full 6-list-sync 7-list-test

1.Memory allocation in Linux kernel:

  • 观察调用 kmalloc() 对内存的分配情况
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
* Kernel API lab
*
* mem.c - Memory allocation in Linux
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ctype.h>

MODULE_DESCRIPTION("Print memory");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

static char *mem;

static int mem_init(void)
{
size_t i;

mem = kmalloc(4096 * sizeof(*mem), GFP_KERNEL);
if (mem == NULL)
goto err_mem;

pr_info("chars: ");
for (i = 0; i < 4096; i++) {
if (isalpha(mem[i]))
printk(KERN_CONT "%c ", mem[i]);
}
pr_info("\n");

return 0;

err_mem:
return -1;
}

/* Module exit: releases the block allocated by mem_init(). */
static void mem_exit(void)
{
kfree(mem);
}

module_init(mem_init);
module_exit(mem_exit);
  • 结果如下:(printk 默认换行,可以使用 KERN_CONT 禁止换行)
1
2
3
4
5
6
7
8
9
10
mem: loading out-of-tree module taints kernel.                                  
chars: Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z
Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z

2.Sleeping in atomic context:

  • 熟悉自旋锁的使用:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
* Kernel API lab
*
* sched-spin.c: Sleeping in atomic context
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>

MODULE_DESCRIPTION("Sleep while atomic");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/*
 * Module init: deliberately tries to sleep while holding a spinlock.
 *
 * A spinlock on the stack is initialised and taken, then the task sets
 * itself TASK_INTERRUPTIBLE and calls schedule_timeout() before releasing
 * the lock.
 * NOTE(review): per the file header ("Sleeping in atomic context") this is
 * an intentional demonstration of the error; do not reuse this pattern.
 *
 * Return: always 0.
 */
static int sched_spin_init(void)
{
spinlock_t lock;

spin_lock_init(&lock);

/* Acquire the lock; code between here and spin_unlock() must not sleep. */
spin_lock(&lock);
set_current_state(TASK_INTERRUPTIBLE);
/* Try to sleep for 5 seconds. */
schedule_timeout(5 * HZ);

/* Release the lock. */
spin_unlock(&lock);
return 0;
}

/* Module exit: nothing to clean up, init leaves no state behind. */
static void sched_spin_exit(void)
{
}

module_init(sched_spin_init);
module_exit(sched_spin_exit);
  • 加载内核模块时报出了 scheduling while atomic 这个错误(在持有自旋锁的情况下调用了 schedule_timeout());日志中的 "taints kernel" 一行来自加载树外(out-of-tree)模块,并不是这个错误造成的(内核受到污染意味着内核处于社区不支持的状态)
1
2
3
4
5
6
7
8
root@qemux86:~/skels/kernel_api/2-sched-spin# insmod sched-spin.ko              
sched_spin: loading out-of-tree module taints kernel.
BUG: scheduling while atomic: insmod/239/0x00000002
1 lock held by insmod/239:
#0: c585bdb8 (&lock){+.+.}-{2:2}, at: sched_spin_init+0x32/0x90 [sched_spin]
Modules linked in: sched_spin(O+)
CPU: 0 PID: 239 Comm: insmod Tainted: G O 5.10.14+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04
  • 从错误消息中以 BUG 开头的那一行可以看出:不能在原子上下文中睡眠(持有自旋锁期间处于原子上下文,此时不允许调度)

3.Working with kernel memory:

  • 为结构体 struct task_info 分配内存并初始化其字段
  • 为当前进程、父进程、下一进程、下一进程的下一进程分配结构体 struct task_info
  • 显示四个结构
  • 释放结构占用的内存
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/*
* SO2 lab3 - task 3
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

MODULE_DESCRIPTION("Memory processing");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* Snapshot of one task, taken when the record is allocated. */
struct task_info {
/* PID of the task the record describes. */
pid_t pid;
/* Value of jiffies at allocation time. */
unsigned long timestamp;
};

/* The four records created in memory_init() and released in memory_exit(). */
static struct task_info *ti1, *ti2, *ti3, *ti4;

static struct task_info *task_info_alloc(int pid)
{
struct task_info *ti;

/* TODO 1: allocated and initialize a task_info struct */
ti = (struct task_info*)kmalloc(sizeof(struct task_info),NULL);
ti->pid = pid;
ti->timestamp = jiffies;

return ti;
}

static int memory_init(void)
{
struct task_struct *p;
/* TODO 2: call task_info_alloc for current pid */
p = current;
ti1 = task_info_alloc(p->pid);
/* TODO 2: call task_info_alloc for parent PID */
p = current->parent;
ti2 = task_info_alloc(p->pid);
/* TODO 2: call task_info alloc for next process PID */
p = next_task(p);
ti3 = task_info_alloc(p->pid);
/* TODO 2: call task_info_alloc for next process of the next process */
p = next_task(next_task(p));
ti4 = task_info_alloc(p->pid);
return 0;

}

/*
 * Module exit: logs the four records as "pid:timestamp" and frees them.
 *
 * Fixes over the previous version: each line prints the timestamp of its own
 * record (all four used ti1->timestamp), the unsigned long timestamp is
 * printed with %lu instead of %d, and every message ends with a newline so
 * the last line is not held back in the log buffer.
 */
static void memory_exit(void)
{
	printk("%d:%lu\n", ti1->pid, ti1->timestamp);
	printk("%d:%lu\n", ti2->pid, ti2->timestamp);
	printk("%d:%lu\n", ti3->pid, ti3->timestamp);
	printk("%d:%lu\n", ti4->pid, ti4->timestamp);

	kfree(ti1);
	kfree(ti2);
	kfree(ti3);
	kfree(ti4);
}

module_init(memory_init);
module_exit(memory_exit);
  • 结果:
1
2
3
4
5
6
root@qemux86:~/skels/kernel_api/3-memory# insmod memory.ko                      
memory: loading out-of-tree module taints kernel.
root@qemux86:~/skels/kernel_api/3-memory# rmmod memory.ko
237:-48258
213:-48258
214:-48258

4.Working with kernel lists:

  • 熟悉 Linux 链表的使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/*
* Kernel API lab
*
* list.c: Working with lists
*
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/sched/signal.h>

MODULE_DESCRIPTION("Use list to process task info");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* One list element describing a task. */
struct task_info {
/* PID of the task the element describes. */
pid_t pid;
/* Value of jiffies when the element was allocated. */
unsigned long timestamp;
/* Linkage into the module-wide list anchored at head. */
struct list_head list;
};

/* Anchor of the task_info list; initialised in list_init(). */
static struct list_head head;

/*
 * Create a task_info element for @pid, stamped with the current jiffies.
 * The list linkage is left for the caller to set up.
 *
 * Return: the new element, or NULL when kmalloc() fails.
 */
static struct task_info *task_info_alloc(int pid)
{
	struct task_info *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

	if (!entry)
		return NULL;

	entry->pid = pid;
	entry->timestamp = jiffies;
	return entry;
}

static void task_info_add_to_list(int pid)
{
struct task_info *ti;

/* TODO 1: Allocate task_info and add it to list */
ti = task_info_alloc(pid);
list_add(&ti->list,&head);
}

static void task_info_add_for_current(void)
{
/* Add current, parent, next and next of next to the list */
task_info_add_to_list(current->pid);
task_info_add_to_list(current->parent->pid);
task_info_add_to_list(next_task(current)->pid);
task_info_add_to_list(next_task(next_task(current))->pid);
}

static void task_info_print_list(const char *msg)
{
struct list_head *p;
struct task_info *ti;

pr_info("%s: [ ", msg);
list_for_each(p, &head) {
ti = list_entry(p, struct task_info, list);
pr_info("(%d, %lu) ", ti->pid, ti->timestamp);
}
pr_info("]\n");
}

static void task_info_purge_list(void)
{
struct list_head *p, *tmp;
struct task_info *ti;

/* TODO 2: Iterate over the list and delete all elements */
list_for_each_safe(p, tmp, &head) {
ti = list_entry(p, struct task_info, list);
list_del(p);
kfree(ti);
}
}

/*
 * Module init: initialises the list anchor and adds the elements for the
 * current task and its neighbours.
 *
 * Return: always 0.
 */
static int list_init(void)
{
INIT_LIST_HEAD(&head);
task_info_add_for_current();
return 0;
}

/* Module exit: logs the list contents, then unlinks and frees every element. */
static void list_exit(void)
{
task_info_print_list("before exiting");
task_info_purge_list();
}

module_init(list_init);
module_exit(list_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
root@qemux86:~/skels/kernel_api/4-list# insmod list.ko                          
root@qemux86:~/skels/kernel_api/4-list# rmmod list.ko
216:-48258
before exiting: [
(1, 4294930828)
(0, 4294930828)
(213, 4294930828)
(239, 4294930828)
]

5.Working with kernel lists for process handling:

  • 熟悉 Linux 链表的使用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
* Kernel API lab
*
* list-full.c: Working with lists (advanced)
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/sched/signal.h>

MODULE_DESCRIPTION("Full list processing");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* One list element describing a task. */
struct task_info {
/* PID of the task the element describes. */
pid_t pid;
/* jiffies value from allocation, refreshed each time the PID is added again. */
unsigned long timestamp;
/* Number of times the PID was added again after the element was created. */
atomic_t count;
/* Linkage into the module-wide list anchored at head. */
struct list_head list;
};

/* Anchor of the task_info list; initialised in list_full_init(). */
static struct list_head head;

/*
 * Create a task_info element for @pid with the current jiffies as its
 * timestamp and a re-add counter of zero.
 *
 * Return: the new element, or NULL when kmalloc() fails.
 */
static struct task_info *task_info_alloc(int pid)
{
	struct task_info *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

	if (!entry)
		return NULL;

	entry->pid = pid;
	entry->timestamp = jiffies;
	atomic_set(&entry->count, 0);
	return entry;
}

static struct task_info *task_info_find_pid(int pid)
{
struct list_head *p;
struct task_info *ti;

/* TODO 1: Look for pid and return task_info or NULL if not found */
list_for_each(p, &head) {
ti = list_entry(p, struct task_info, list);
if(ti->pid == pid){
return ti;
}
}
return NULL;
}

/*
 * Record @pid in the list.
 *
 * If an element for @pid already exists its timestamp is refreshed and its
 * counter incremented; otherwise a new element is allocated and inserted at
 * the front. An allocation failure leaves the list unchanged; the old code
 * handed the NULL result to list_add().
 */
static void task_info_add_to_list(int pid)
{
	struct task_info *ti;

	ti = task_info_find_pid(pid);
	if (ti != NULL) {
		ti->timestamp = jiffies;
		atomic_inc(&ti->count);
		return;
	}

	ti = task_info_alloc(pid);
	if (ti == NULL)
		return;

	list_add(&ti->list, &head);
}

static void task_info_add_for_current(void)
{
task_info_add_to_list(current->pid);
task_info_add_to_list(current->parent->pid);
task_info_add_to_list(next_task(current)->pid);
task_info_add_to_list(next_task(next_task(current))->pid);
}

static void task_info_print_list(const char *msg)
{
struct list_head *p;
struct task_info *ti;

pr_info("%s: [ ", msg);
list_for_each(p, &head) {
ti = list_entry(p, struct task_info, list);
pr_info("(%d, %lu) ", ti->pid, ti->timestamp);
}
pr_info("]\n");
}

static void task_info_remove_expired(void)
{
struct list_head *p, *q;
struct task_info *ti;

list_for_each_safe(p, q, &head) {
ti = list_entry(p, struct task_info, list);
if (jiffies - ti->timestamp > 3 * HZ && atomic_read(&ti->count) < 5) {
list_del(p);
kfree(ti);
}
}
}

static void task_info_purge_list(void)
{
struct list_head *p, *q;
struct task_info *ti;

list_for_each_safe(p, q, &head) {
ti = list_entry(p, struct task_info, list);
list_del(p);
kfree(ti);
}
}

/*
 * Module init: builds the list for the current task and its neighbours,
 * logs it, then sleeps for up to 5 seconds (interruptible) before returning.
 * NOTE(review): the sleep presumably lets the elements age past the 3-second
 * expiry threshold used by task_info_remove_expired() - confirm.
 *
 * Return: always 0.
 */
static int list_full_init(void)
{
INIT_LIST_HEAD(&head);

task_info_add_for_current();
task_info_print_list("after first add");

/* Mark the task as sleeping, then yield the CPU for 5 * HZ jiffies. */
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

return 0;
}

static void list_full_exit(void)
{
struct task_info *ti;

/* TODO 2: Ensure that at least one task is not deleted */
ti = task_info_find_pid(current->parent->pid);
if(ti == NULL){
printk("not find pid: %d",ti->pid);
}
else{
printk("find pid: %d",ti->pid);
}

list_del(&ti->list);
task_info_remove_expired();
list_add(&ti->list, &head);
task_info_print_list("after removing expired");
task_info_purge_list();
task_info_remove_expired();
}

module_init(list_full_init);
module_exit(list_full_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
root@qemux86:~/skels/kernel_api/5-list-full# insmod list-full.ko                
list_full: loading out-of-tree module taints kernel.
after first add: [
(1, 4294916763)
(0, 4294916763)
(213, 4294916763)
(238, 4294916763)
]
root@qemux86:~/skels/kernel_api/5-list-full# rmmod list-full.ko
find pid: 213
after removing expired: [
(213, 4294916763)
]

6.Synchronizing list work:

  • 熟悉 Linux 锁的使用
  • 熟悉 Linux 符号的导出
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/*
* Linux API lab
*
* list-sync.c - Synchronize access to a list
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/sched/signal.h>

MODULE_DESCRIPTION("Full list processing with synchronization");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

/* One list element describing a task. */
struct task_info {
/* PID of the task the element describes. */
pid_t pid;
/* jiffies value from allocation, refreshed each time the PID is added again. */
unsigned long timestamp;
/* Number of times the PID was added again after the element was created. */
atomic_t count;
/* Linkage into the module-wide list anchored at head. */
struct list_head list;
};

/* Anchor of the task_info list; initialised in list_sync_init(). */
static struct list_head head;

/* Spinlock taken around every traversal and modification of the list. */
DEFINE_SPINLOCK(lock);

/*
 * Create a task_info element for @pid with the current jiffies as its
 * timestamp and a re-add counter of zero.
 *
 * Return: the new element, or NULL when kmalloc() fails.
 */
static struct task_info *task_info_alloc(int pid)
{
	struct task_info *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

	if (!entry)
		return NULL;

	entry->pid = pid;
	entry->timestamp = jiffies;
	atomic_set(&entry->count, 0);
	return entry;
}

static struct task_info *task_info_find_pid(int pid)
{
struct list_head *p;
struct task_info *ti;

list_for_each(p, &head) {
ti = list_entry(p, struct task_info, list);
if (ti->pid == pid) {
return ti;
}
}

return NULL;
}

/*
 * Record @pid in the list, under the list spinlock.
 *
 * If an element for @pid exists its timestamp is refreshed and its counter
 * incremented; otherwise a new element is inserted at the front.
 *
 * Locking notes:
 *   - the lock is released on the "already present" path too; the old code
 *     returned with the lock held, so the next call spun forever on it;
 *   - the GFP_KERNEL allocation is done with the lock dropped because it may
 *     sleep; the list is therefore searched again after re-taking the lock
 *     in case the same PID was inserted in the meantime;
 *   - an allocation failure leaves the list unchanged.
 */
static void task_info_add_to_list(int pid)
{
	struct task_info *ti, *fresh;

	spin_lock(&lock);
	ti = task_info_find_pid(pid);
	if (ti != NULL) {
		ti->timestamp = jiffies;
		atomic_inc(&ti->count);
		spin_unlock(&lock);
		return;
	}
	spin_unlock(&lock);

	fresh = task_info_alloc(pid);
	if (fresh == NULL)
		return;

	spin_lock(&lock);
	ti = task_info_find_pid(pid);
	if (ti != NULL) {
		/* Lost the race: update the existing element instead. */
		ti->timestamp = jiffies;
		atomic_inc(&ti->count);
	} else {
		list_add(&fresh->list, &head);
		fresh = NULL;
	}
	spin_unlock(&lock);

	/* Non-NULL only when the new element was not inserted. */
	kfree(fresh);
}

void task_info_add_for_current(void)
{
task_info_add_to_list(current->pid);
task_info_add_to_list(current->parent->pid);
task_info_add_to_list(next_task(current)->pid);
task_info_add_to_list(next_task(next_task(current))->pid);
}
EXPORT_SYMBOL(task_info_add_for_current);
/* TODO 2: Export the kernel symbol */

void task_info_print_list(const char *msg)
{
struct list_head *p;
struct task_info *ti;

pr_info("%s: [ ", msg);

/* TODO 1: Protect list, is this read or write access? */
spin_lock(&lock);
list_for_each(p, &head) {
ti = list_entry(p, struct task_info, list);
pr_info("(%d, %lu) ", ti->pid, ti->timestamp);
}
spin_unlock(&lock);
/* TODO 1: Critical section ends here */
pr_info("]\n");
}
EXPORT_SYMBOL(task_info_print_list);
/* TODO 2: Export the kernel symbol */

void task_info_remove_expired(void)
{
struct list_head *p, *q;
struct task_info *ti;

/* TODO 1: Protect list, is this read or write access? */
spin_lock(&lock);
list_for_each_safe(p, q, &head) {
ti = list_entry(p, struct task_info, list);
if (jiffies - ti->timestamp > 3 * HZ && atomic_read(&ti->count) < 5) {
list_del(p);
kfree(ti);
}
}
spin_unlock(&lock);
/* TODO 1: Critical section ends here */
}
EXPORT_SYMBOL(task_info_remove_expired);
/* TODO 2: Export the kernel symbol */

static void task_info_purge_list(void)
{
struct list_head *p, *q;
struct task_info *ti;

/* TODO 1: Protect list, is this read or write access? */
spin_lock(&lock);
list_for_each_safe(p, q, &head) {
ti = list_entry(p, struct task_info, list);
list_del(p);
kfree(ti);
}
spin_unlock(&lock);
/* TODO 1: Critical sections ends here */
}

/*
 * Module init: builds the list for the current task and its neighbours,
 * logs it, then sleeps for up to 5 seconds (interruptible) before returning.
 * NOTE(review): the sleep presumably lets the elements age past the 3-second
 * expiry threshold used by task_info_remove_expired() - confirm.
 *
 * Return: always 0.
 */
static int list_sync_init(void)
{
INIT_LIST_HEAD(&head);

task_info_add_for_current();
task_info_print_list("after first add");

/* Mark the task as sleeping, then yield the CPU for 5 * HZ jiffies. */
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

return 0;
}

static void list_sync_exit(void)
{
struct task_info *ti;

ti = list_entry(head.prev, struct task_info, list);
atomic_set(&ti->count, 10);

task_info_remove_expired();
task_info_print_list("after removing expired");
task_info_purge_list();
}

module_init(list_sync_init);
module_exit(list_sync_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
root@qemux86:~/skels/kernel_api/6-list-sync# insmod list-sync.ko                
list_sync: loading out-of-tree module taints kernel.
after first add: [
(1, 4294905468)
(0, 4294905468)
(213, 4294905468)
(237, 4294905468)
]
root@qemux86:~/skels/kernel_api/6-list-sync# rmmod list-sync.ko
after removing expired: [
(237, 4294905468)
]
  • 需要加锁的不是"有循环的地方",而是所有可能被并发访问的共享数据(这里是链表)的遍历和修改;这里加的是自旋锁,持锁期间不能睡眠,并且每条返回路径上都必须释放锁

7.Test module calling in our list module:

  • 从位于目录 6-list-sync/ 的模块中导出各个符号
  • 删除 6-list-sync/ 的模块中所有的锁
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
* SO2 lab3 - task 7
*/

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>

MODULE_DESCRIPTION("Test list processing");
MODULE_AUTHOR("SO2");
MODULE_LICENSE("GPL");

extern void task_info_add_for_current(void);
extern void task_info_remove_expired(void);
extern void task_info_print_list(const char *msg);

/*
 * Module init: calls the functions declared extern above (exported by the
 * 6-list-sync module) to add the entries for the current task and log the
 * resulting list.
 *
 * Return: always 0.
 */
static int list_test_init(void)
{
/* These calls require the symbols to be exported by 6-list-sync. */
task_info_add_for_current();
task_info_print_list("after new addition");

return 0;
}

/*
 * Module exit: asks the 6-list-sync module to drop its expired entries and
 * logs what remains.
 */
static void list_test_exit(void)
{
/* These calls require the symbols to be exported by 6-list-sync. */
task_info_remove_expired();
task_info_print_list("after removing expired");
}

module_init(list_test_init);
module_exit(list_test_exit);
  • 结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
root@qemux86:~# insmod ./skels/kernel_api/6-list-sync/list-sync.ko              
list_sync: loading out-of-tree module taints kernel.
after first add: [
(1, 4294904706)
(0, 4294904706)
(213, 4294904706)
(237, 4294904706)
]
root@qemux86:~# insmod ./skels/kernel_api/7-list-test/list-test.ko
after new addition: [
(238, 4294906931)
(1, 4294906931)
(0, 4294906931)
(213, 4294906931)
(237, 4294904706)
]
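  • 在加载模块的时候出现了死锁的问题,当时的处理办法是把 Synchronizing list work 中的锁删掉;但真正的原因是 task_info_add_to_list() 在找到已有 pid 后直接 return,没有先调用 spin_unlock(),导致下一次调用再次获取同一把自旋锁时死锁——正确的修复是在该 return 之前释放锁,而不是删除锁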
1
2
3
4
5
insmod/238 is trying to acquire lock:                                           
d0847110 (lockA){+.+.}-{2:2}, at: task_info_add_to_list+0x11/0xb0 [list_sync]

but task is already holding lock:
d0847110 (lockA){+.+.}-{2:2}, at: task_info_add_to_list+0x11/0xb0 [list_sync]