
Deep Dive into Go

Go is a new language: concurrent, garbage-collected, and fast to compile. It combines the ease of an interpreted language, the development speed of a dynamically typed language, and the safety of a statically typed one.

To check which Go version a binary (here named test) was built with:

go version test

Data Structures

Basic Types

package main

import "log"

func main() {
    i := 1234
    j := int32(1)
    f := float32(3.14)
    bytes := [5]byte{'h', 'e', 'l', 'l', 'o'}
    primes := [4]int{2, 3, 5, 7}

    // note: %s needs bytes[:] (a slice); %s applied to a [5]byte array
    // would format each element separately instead of printing "hello"
    log.Fatalf("%d %d %f %s %v", i, j, f, bytes[:], primes)
}
  • Go's basic types are almost identical to C's

Structs and Pointers

package main

import "fmt"

type Point struct{ X, Y int }
type Rect1 struct{ Min, Max Point }
type Rect2 struct{ Min, Max *Point }

func main() {
    point := Point{1, 2}   // construct a struct value
    pointp := &Point{1, 2} // construct a struct and take a pointer to it

    fmt.Println(point.X, point.Y, point, pointp.X, pointp.Y, pointp)

    r1 := Rect1{Point{1, 2}, Point{3, 4}}
    r2 := Rect2{&Point{1, 2}, &Point{3, 4}}
    fmt.Println(r1, r2, r2.Max, r2.Min)
}
1 2 {1 2} 1 2 &{1 2}
{{1 2} {3 4}} {0xc00001e100 0xc00001e110} &{3 4} &{1 2}
  • Unlike C, Go restricts pointer use: there is no pointer arithmetic, and a pointer is either nil or created already referring to a valid object (e.g. &Point{1, 2})

Strings

package main

import "fmt"

func main() {
    s := "hello" // ptr=&"hello", len=5
    t := s[2:3]  // ptr=&"llo", len=1

    fmt.Println(s, t)
}
  • In Go's memory model a string is a 16-byte structure: a pointer to the string's backing data plus a length
  • Because string values are immutable, it is safe for multiple strings to share the same backing data
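
Since the backing data is shared rather than copied, substring slicing is O(1). A minimal sketch of both properties (the commented-out line shows the immutability that makes the sharing safe):

package main

import "fmt"

func main() {
    s := "hello, world"
    t := s[7:12] // a new {ptr, len} header into the same bytes; nothing is copied
    // t[0] = 'W' // does not compile: strings are immutable

    fmt.Println(len(s), len(t), t) // 12 5 world
}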

Slices and Arrays

package main

import "fmt"

func main() {
    x := []int{1, 2, 3, 4, 5} // a slice backed by a five-element array
    y := x[1:3]               // no new data allocated: a new slice header over the same backing store

    fmt.Println(x, y)
}
[1 2 3 4 5] [2 3]
  • A slice is a reference to a contiguous segment of an array; the segment can be the whole array or just part of it
  • In memory it is a three-field struct (slicing an array copies no data; it only creates this new header):
    • a pointer to the slice's first element
    • the slice's length (the upper bound for indexing)
    • the slice's capacity (the upper bound for re-slicing)
package main

import "fmt"

func fun1(x []int) {
    x[0] = 100
    x = append(x, 6) /* capacity exceeded: append allocates a new backing array */
    x[1] = 200
    fmt.Println(x)
}

func main() {
    x := []int{1, 2, 3, 4, 5}
    fmt.Println(x)

    fun1(x)
    fmt.Println(x)
}
[1 2 3 4 5]
[100 200 3 4 5 6]
[100 2 3 4 5]
  • append and similar operations may force a slice to grow automatically (the sketch below makes the growth observable); the rule is roughly:
    • if the required capacity is more than double the current one, grow straight to the required capacity
    • otherwise loop: while the current capacity is below 1024, double it; above that, grow it by 1/4 each round, until it reaches the required capacity (this is the pre-Go-1.18 rule; newer runtimes use a 256-element threshold and a smoother curve)
  • The growth allocated a new backing array, which is why x[1] = 200 never reached the array seen by main
  • Separately, x in main and x in fun1 are two distinct slice headers with independent lengths (5 in main, 6 in fun1 after the append), even while they share a backing array
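
A small harness to watch the growth happen (a sketch; the exact capacities printed depend on the Go version and element size):

package main

import "fmt"

func main() {
    var s []int
    oldCap := cap(s)
    for i := 0; i < 2000; i++ {
        s = append(s, i)
        if c := cap(s); c != oldCap { // append just moved to a new backing array
            fmt.Printf("len=%4d cap=%4d\n", len(s), c)
            oldCap = c
        }
    }
}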

Maps

package main

import "fmt"

func main() {
    var m = make(map[string]int)

    fmt.Printf("Map len = %d\n", len(m))

    m["zero"] = 0
    m["one"] = 1
    m["two"] = 2

    fmt.Printf("Map len = %d\n", len(m))

    fmt.Printf("zero = %T, %v\n", m["zero"], m["zero"])
    fmt.Printf("one = %T, %v\n", m["one"], m["one"])
    fmt.Printf("two = %T, %v\n", m["two"], m["two"])
}
Map len = 0
Map len = 3
zero = int, 0
one = int, 1
two = int, 2
  • A map is an unordered collection of key/value pairs (Go's map is implemented as a hash table under the hood)
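
Two practical consequences of the hash-table implementation are worth a quick sketch: the comma-ok form distinguishes a missing key from a stored zero value, and iteration order is deliberately randomized:

package main

import "fmt"

func main() {
    m := map[string]int{"zero": 0, "one": 1, "two": 2}

    v, ok := m["three"] // comma-ok: "three" is absent, so v is the zero value
    fmt.Println(v, ok)  // 0 false

    for k, v := range m { // order varies from run to run; never rely on it
        fmt.Println(k, v)
    }
}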

Dynamic Memory Allocation: make and new

package main

import (
    "fmt"
    "sync"
)

func main() {
    u := new(user)
    s := make([]int, 5, 100)

    fmt.Println(u)
    fmt.Println(s)
}

type user struct {
    lock sync.Mutex
    name string
    age  int
}

For dynamic allocation Go has two distinct built-ins, make and new:

  • make initializes the built-in data structures (slice, map, channel)
  • new allocates a zeroed piece of memory for the given type and returns a pointer to it
0x4804e0 <main.main+32>    call   40c100h                       <runtime[newobject]> /* new */

RAX 0x48f5e0 (type:*+58848) ◂— 0x20 /* ' ' */
RBX 0x0
0x480500 <main.main+64>    call   445fa0h                       <runtime[makeslice]> /* make */

RAX 0x487e80 (type:*+28288) ◂— 0x8
RBX 0x5
RCX 0x64
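
The difference is easy to see from user code as well; a minimal sketch contrasting the two:

package main

import "fmt"

func main() {
    p := new([]int)        // *[]int: pointer to a zeroed slice header (nil backing array)
    s := make([]int, 3, 8) // []int: initialized header over a freshly allocated array

    fmt.Println(*p == nil, len(s), cap(s)) // true 3 8
}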

Function Calls

Calling Convention

package main

import (
    "fmt"
)

func addtest(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15 uint64) {
    fmt.Println(a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15)
}

func main() {
    addtest(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
}
0x4805f8 <main.main+24>    mov    qword ptr [rsp], 0ah
0x480600 <main.main+32> mov qword ptr [rsp + 8], 0bh
0x480609 <main.main+41> mov qword ptr [rsp + 10h], 0ch
0x480612 <main.main+50> mov qword ptr [rsp + 18h], 0dh
0x48061b <main.main+59> mov qword ptr [rsp + 20h], 0eh
0x480624 <main.main+68> mov qword ptr [rsp + 28h], 0fh
0x48062d <main.main+77>     mov    eax, 1                        <main.main>
0x480632 <main.main+82> mov ebx, 2
0x480637 <main.main+87> mov ecx, 3
0x48063c <main.main+92> mov edi, 4
0x480641 <main.main+97> mov esi, 5
0x480646 <main.main+102> mov r8d, 6
  • The first 9 integer arguments travel in registers (RAX, RBX, RCX, RDI, RSI, R8, R9, R10, R11 in the amd64 register ABI); the remaining ones are spilled to the stack
00:0000│ rsp 0xc00007cef0 —▸ 0x480665 (main.main+133) ◂— mov    rbp, qword ptr [rsp + 78h]
01:0008│     0xc00007cef8 ◂— 0xa /* '\n' */
02:0010│     0xc00007cf00 ◂— 0xb /* '\x0b' */
03:0018│     0xc00007cf08 ◂— 0xc /* '\x0c' */
04:0020│     0xc00007cf10 ◂— 0xd /* '\r' */
05:0028│     0xc00007cf18 ◂— 0xe

Multiple Return Values

package main

import (
    "fmt"
)

func retest() (int, int, int) {
    return 1, 2, 3
}

func main() {
    fmt.Println(retest())
}
RAX  0x498948 (go:string.*+1328) ◂— 0x3131313131313131 ('11111111')
RBX 0x7
RCX 0x498956 (go:string.*+1342) ◂— 0x3332323232323232 ('22222223')
RDX 0x8
RDI 0x7
RSI 0x49895d (go:string.*+1349) ◂— 0x3933333333333333 ('33333339')
R8 0x7
R9 0x0
R10 0x60
  • Return values likewise get 9 registers; anything beyond that is returned on the stack

Closures

package main

import "fmt"

func incSeq() func() int {
    var i = 0
    return func() int { /* this function does not declare i itself; it captures the i from its enclosing environment */
        i++
        return i
    }
}

func main() {
    next := incSeq()

    fmt.Printf("start = %d\n", next())

    for i := 1; i <= 5; i++ {
        fmt.Printf("index(%d) = %d\n", i, next())
    }
}
start = 1
index(1) = 2
index(2) = 3
index(3) = 4
index(4) = 5
index(5) = 6
  • A closure is the combination of a function and the referencing environment it was created in
  • Returning a closure does not return a bare function: it returns a structure recording the function's code pointer together with pointers to the captured variables
  • One last detail: var i = 0 would normally be allocated on the stack, but the compiler detects the capture and allocates it on the heap instead
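
Conceptually, what incSeq returns behaves like this hand-written equivalent (a hypothetical desugaring for illustration only; the type and method names are invented):

package main

import "fmt"

// counter plays the role of the closure object: it pairs the code
// (the next method) with a pointer to the captured, heap-allocated i.
type counter struct {
    i *int
}

func (c *counter) next() int {
    *c.i++
    return *c.i
}

func main() {
    i := 0 // captured by pointer, so it must outlive this frame
    next := &counter{i: &i}
    fmt.Println(next.next(), next.next()) // 1 2
}

The heap move itself can be confirmed by running escape analysis on the real closure: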
package main

func incSeq() func() int {
    var i = 0
    return func() int {
        i++
        return i
    }
}

func main() {
    next := incSeq()
    next()
}
go build --gcflags=-m test.go
  • Use -gcflags=-m to print the compiler's escape analysis
# command-line-arguments
./test.go:3:6: can inline incSeq
./test.go:5:9: can inline incSeq.func1
./test.go:12:16: inlining call to incSeq
./test.go:5:9: can inline main.func1
./test.go:13:6: inlining call to main.func1
./test.go:4:6: moved to heap: i /* i is moved to the heap */
./test.go:5:9: func literal escapes to heap
./test.go:12:16: func literal does not escape

The go Keyword

Go supports concurrency out of the box: the go keyword is all it takes to start a goroutine (a coroutine, i.e. a lightweight thread)

  • Coroutine vs. subroutine: a subroutine call has a single entry point and a single return, and is finished once it exits; a coroutine can suspend partway through and be resumed later
package main

import "fmt"

func gotest(a, b, c int) int {
    fmt.Println(a, b, c)
    return a + b + c
}

func main() {
    go gotest(1, 2, 3)
}
  • Under the hood the go keyword compiles down to a call to runtime.newproc
0x4805e0 <main.main+32>    call   43bc80h                       <runtime[newproc]>
RAX  0x4a0970 (go:func.*+552) —▸ 0x480600 (main.main.func1) ◂— cmp    rsp, qword ptr [r14 + 10h]
RBX 0x0
RCX 0x0
  • runtime.newproc starts the new goroutine: it sets up a fresh stack, copies the stack arguments into it, and leaves the stack pointer pointing at them
  • The compiler also emits a helper function for gotest; IDA decompiles it as follows:
void __golang main_main_func1()
{
    __int64 v0; // rbp
    int v1; // r14
    char **v2; // r12
    __int64 v3[4]; // [rsp-18h] [rbp-20h] BYREF
    void *retaddr; // [rsp+8h] [rbp+0h] BYREF
    char v5; // [rsp+10h] [rbp+8h] BYREF

    if ( (unsigned int)&retaddr <= *(_QWORD *)(v1 + 16LL) )
        runtime_morestack_noctxt();
    v3[3LL] = v0;
    v2 = *(char ***)(v1 + 32LL);
    if ( v2 && *v2 == &v5 )
        *v2 = (char *)v3;
    main_gotest(v3[0LL], v3[1LL], v3[2LL]); /* call the original gotest */
}

The defer Keyword

defer is used to release resources: the deferred call runs right before the function returns (a defer statement guarantees the call executes whether the function exits normally or not)

package main

func A() {
    defer println("1")
    defer func() {
        defer println("2")
    }()
    defer println("3")
    println("start")
}

func main() {
    A()
}
start
3
2
1
  • Deferred expressions run just before return
  • Multiple defers are called in stack order: the later a defer is registered, the earlier it runs
package main

import "fmt"

func f() (result int) {
    defer func() {
        result++
    }()
    return 0
}
func f2() (r int) {
    t := 5
    defer func() {
        t = t + 5
    }()
    return t
}
func f3() (r int) {
    defer func(r int) {
        r = r + 5
    }(r)
    return 1
}
func main() {
    fmt.Println(f())
    fmt.Println(f2())
    fmt.Println(f3())
}
1
5
1
  • A return statement is not atomic: it has two steps, assigning the return value and actually returning, and deferred functions run between the two
  • The samples above are therefore equivalent to the following code:
package main

import "fmt"

func f() (result int) {
    result = 0
    defer func() {
        result++
    }()
    return /* returns 1 */
}
func f2() (r int) {
    t := 5
    r = t
    defer func() {
        t = t + 5
    }()
    return /* returns 5 */
}
func f3() (r int) {
    r = 1
    defer func(r int) {
        r = r + 5
    }(r)
    return /* returns 1 */
}
func main() {
    fmt.Println(f())
    fmt.Println(f2())
    fmt.Println(f3())
}
  • The compiler emits a helper function for each defer statement and calls it after the return values are assigned but before the function executes its ret instruction
0x458096 <main[A]+86>     call   458100h                       <main.A.func1>

Segmented Stacks and Contiguous Stacks

A goroutine can start with only a tiny stack and grow it automatically as needed:

  • On (almost) every function call a check inserted by the Go toolchain runs; if the current stack is too small, it "traps" out of the function into the Go runtime library
  • The runtime then allocates a sufficiently large new stack; what happens next depends on the strategy

In IDA pseudocode this check shows up constantly:

if ( (unsigned int)&retaddr <= *(_QWORD *)(v1 + 16LL) )
    runtime_morestack_noctxt();
  • morestack_noctxt is the function that grows the stack

Before go-1.3 the layout used was a segmented stack:

  • As a goroutine calls deeper or needs more and more locals, its stack can run out of space
  • The runtime then calls runtime.morestack and runtime.newstack to create a new stack segment; the segments are not contiguous, and a goroutine's segments are chained into a doubly linked list that the runtime walks via pointers
  • As calls unwind, segments no longer in use are reclaimed by the system

Segmented stacks have a problem, though: if a goroutine's stack is nearly full, any function call triggers a stack grow, and the matching return triggers a shrink. Calling a function inside a loop then turns this allocate/free cycle into huge overhead, known as the hot split problem

Contiguous stacks solve both problems of the segmented design:

  • The core idea: whenever the stack runs out, allocate a new stack twice the old size and migrate every value from the old stack into it, leaving plenty of room for new locals and calls

With contiguous stacks, growing on overflow goes through these steps:

  • call runtime.newstack to allocate a larger stack
  • use runtime.copystack to copy everything from the old stack into the new one
  • repoint pointers that referred into the old stack at the new stack
  • call runtime.stackfree to destroy and reclaim the old stack
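
The mechanism is easy to provoke: deep recursion marches far past the initial goroutine stack (2KB in recent runtimes), yet keeps running because every prologue check can trigger a grow-and-copy. A minimal sketch:

package main

import "fmt"

// deep burns a few hundred bytes of stack per frame; at this depth the
// goroutine stack must be grown (copied) many times along the way.
func deep(n int) int {
    var pad [256]byte
    pad[0] = byte(n)
    if n == 0 {
        return int(pad[0])
    }
    return deep(n-1) + 1
}

func main() {
    fmt.Println(deep(50000)) // 50000, on a stack grown on demand
}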

System Calls

Go's syscall package wraps raw system calls; it calls entersyscall before performing the actual system call and exitsyscall after the call returns

func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
    args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err}
    entersyscall()
    libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&args))
    exitsyscall()
    return args.r1, args.r2, args.err
}

These two functions tell the Go runtime that the goroutine has entered or finished a system call, so the scheduler can react accordingly:

  • entersyscall:
    • puts the current M's P into the _Psyscall state, marking the P -> M binding as released, though the M keeps its pointer to the P
  • exitsyscall:
    • since the M still points at its old P, it prefers to reacquire that P; if the P has been taken away, it grabs a new idle P, and if none is available the G is hung on the global run queue
    • Go's sysmon (the internal monitor thread) spots Ms stuck in a syscall for over 10 ms, strips the P away and hands it to another M to keep executing, creating new Ms if there aren't enough
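
From user code the bracketing is invisible; potentially-blocking wrappers in the syscall package all go through it. A trivial sketch (Unix paths assumed):

package main

import (
    "fmt"
    "syscall"
)

func main() {
    // Open/Read/Close are built on syscall.Syscall, which brackets the
    // kernel call with entersyscall/exitsyscall so the scheduler can
    // release the P if the call blocks for long.
    fd, err := syscall.Open("/dev/null", syscall.O_RDONLY, 0)
    if err != nil {
        panic(err)
    }
    defer syscall.Close(fd)

    var buf [16]byte
    n, _ := syscall.Read(fd, buf[:])
    fmt.Println("read", n, "bytes") // read 0 bytes
}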

Goroutine Scheduling

Go's scheduler is built on a few key data structures the runtime library uses to schedule goroutines and manage how they map onto physical threads: struct G, struct M, struct P, and the Sched struct

Struct G

G is short for goroutine. It is the goroutine's control structure, the runtime's abstraction of a goroutine, analogous to a process control block in an operating system

type g struct {
stack stack // offset known to runtime/cgo
/* type stack struct {
       lo uintptr // low address of this goroutine's stack
       hi uintptr // high address of this goroutine's stack
   } */
stackguard0 uintptr // stack-growth check value for Go prologues: SP below this triggers growth (or preemption, via stackPreempt)
stackguard1 uintptr // the same check for C/cgo prologues (g0, gsignal)

_panic *_panic // innermost panic - offset known to liblink
_defer *_defer // innermost defer
m *m // current m; offset known to arm liblink
sched gobuf // context (sp, pc, ...) saved across goroutine switches
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
stktopsp uintptr // expected sp at top of stack, to check in traceback
// param is a generic pointer parameter field used to pass
// values in particular contexts where other storage for the
// parameter would be difficult to find. It is currently used
// in four ways:
// 1. When a channel operation wakes up a blocked goroutine, it sets param to
// point to the sudog of the completed blocking operation.
// 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
// the GC cycle. It is unsafe to do so in any other way, because the goroutine's
// stack may have moved in the meantime.
// 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
// closure in the runtime is forbidden.
// 4. When a panic is recovered and control returns to the respective frame,
// param may point to a savedOpenDeferState.
param unsafe.Pointer // generic parameter: set by another goroutine while this one sleeps, read by this one on wakeup
atomicstatus atomic.Uint32
stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
goid uint64 // this goroutine's id
schedlink guintptr
waitsince int64 // approx time when the g become blocked
waitreason waitReason // if status==Gwaiting

preempt bool // preemption signal, duplicates stackguard0 = stackpreempt
preemptStop bool // transition to _Gpreempted on preemption; otherwise, just deschedule
preemptShrink bool // shrink stack at synchronous safe point

// asyncSafePoint is set if g is stopped at an asynchronous
// safe point. This means there are frames on the stack
// without precise pointer information.
asyncSafePoint bool

paniconfault bool // panic (instead of crash) on unexpected fault address
gcscandone bool // g has scanned stack; protected by _Gscan bit in status
throwsplit bool // must not split stack
// activeStackChans indicates that there are unlocked channels
// pointing into this goroutine's stack. If true, stack
// copying needs to acquire channel locks to protect these
// areas of the stack.
activeStackChans bool
// parkingOnChan indicates that the goroutine is about to
// park on a chansend or chanrecv. Used to signal an unsafe point
// for stack shrinking.
parkingOnChan atomic.Bool
// inMarkAssist indicates whether the goroutine is in mark assist.
// Used by the execution tracer.
inMarkAssist bool
coroexit bool // argument to coroswitch_m

raceignore int8 // ignore race detection events
nocgocallback bool // whether disable callback from C
tracking bool // whether we're tracking this G for sched latency statistics
trackingSeq uint8 // used to decide whether to track this G
trackingStamp int64 // timestamp of when the G last started being tracked
runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking
lockedm muintptr // the G is locked to this M and may only run there
sig uint32
writebuf []byte
sigcode0 uintptr
sigcode1 uintptr
sigpc uintptr
parentGoid uint64 // goid of the parent goroutine
gopc uintptr // pc of the go statement that created this goroutine
ancestors *[]ancestorInfo // ancestor information goroutine(s) that created this goroutine (only used if debug.tracebackancestors)
startpc uintptr // pc of goroutine function
racectx uintptr
waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
cgoCtxt []uintptr // cgo traceback context
labels unsafe.Pointer // profiler labels
timer *timer // cached timer for time.Sleep
sleepWhen int64 // when to sleep until
selectDone atomic.Uint32 // are we participating in a select and did someone win the race?

coroarg *coro // argument during coroutine transfers

// goroutineProfiled indicates the status of this goroutine's stack for the
// current in-progress goroutine profile
goroutineProfiled goroutineProfileStateHolder

// Per-G tracer state.
trace gTraceState

// Per-G GC state

// gcAssistBytes is this G's GC assist credit in terms of
// bytes allocated. If this is positive, then the G has credit
// to allocate gcAssistBytes bytes without assisting. If this
// is negative, then the G must correct this by performing
// scan work. We track this in bytes to make it fast to update
// and check for debt in the malloc hot path. The assist ratio
// determines how this corresponds to scan work debt.
gcAssistBytes int64
}
  • On a goroutine switch the context is saved into the struct's sched field; a goroutine switch never has to trap into the OS kernel

Struct M

M is short for machine, the abstraction of a machine thread; every M corresponds to one OS thread

  • An M must be bound to a P before it can run Go code
  • While blocked or inside a system call, it does not need an associated P
type m struct {
g0 *g // the goroutine that owns the scheduling stack
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink
_ uint32 // align next field to 8 bytes

// Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // the goroutine currently running on this M
caughtsig guintptr // goroutine running during fatal signal
p puintptr // attached p for executing go code (nil if not executing go code)
nextp puintptr
oldp puintptr // the p that was attached before executing a syscall
id int64
mallocing int32 // allocation-in-progress state flag
throwing throwType
preemptoff string // if != "", keep curg running on this m
locks int32
dying int32
profilehz int32
spinning bool // m is out of work and is actively looking for work
blocked bool // m is blocked on a note
newSigstack bool // minit on C thread called sigaltstack
printlock int8
incgo bool // m is executing a cgo call
isextra bool // m is an extra m
isExtraInC bool // m is an extra m that is not executing Go code
isExtraInSig bool // m is an extra m in a signal handler
freeWait atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait)
needextram bool
traceback uint8
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily
cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
park note
alllink *m // on allm
schedlink muintptr
lockedg guintptr
createstack [32]uintptr // stack that created this thread, it's used for StackRecord.Stack0, so it must align with it.
lockedExt uint32 // tracking for external LockOSThread
lockedInt uint32 // tracking for internal lockOSThread
nextwaitm muintptr // next m waiting for lock

mLockProfile mLockProfile // fields relating to runtime.lock contention

// wait* are used to carry arguments from gopark into park_m, because
// there's no stack to put them on. That is their sole purpose.
waitunlockf func(*g, unsafe.Pointer) bool
waitlock unsafe.Pointer
waitTraceBlockReason traceBlockReason
waitTraceSkip int

syscalltick uint32
freelink *m // on sched.freem
trace mTraceState

// these are here because they are too large to be on the stack
// of low-level NOSPLIT functions.
libcall libcall
libcallpc uintptr // for cpu profiler
libcallsp uintptr
libcallg guintptr
syscall libcall // stores syscall parameters on windows

vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
vdsoPC uintptr // PC for traceback while in VDSO call

// preemptGen counts the number of completed preemption
// signals. This is used to detect when a preemption is
// requested, but fails.
preemptGen atomic.Uint32

// Whether this is a pending preemption signal on this M.
signalPending atomic.Uint32

// pcvalue lookup cache
pcvalueCache pcvalueCache

dlogPerM

mOS

chacha8 chacha8rand.State
cheaprand uint64

// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int
locksHeld [10]heldLockInfo
}
  • An ordinary goroutine's stack is a growable stack allocated on the heap, whereas g0's stack is the underlying OS thread's stack
  • All scheduler code switches onto g0's stack before it runs

Struct P

P is short for Processor, a logical processor; every P owns a local run queue and provides the local resources a G needs to run on an M

  • M is the OS thread; P represents the resources needed to execute Go code, so an M must be bound to a P while it runs Go code
  • All Ps live in a single array, and the work-stealing scheduler operates across them
type p struct {
id int32
status uint32 // one of pidle/prunning/...
link puintptr
schedtick uint32 // incremented on every scheduler invocation
syscalltick uint32 // incremented on every system call
sysmontick sysmontick // last tick observed by sysmon
m muintptr // back-link to associated m (nil if idle)
mcache *mcache // per-P small-object allocation cache
pcache pageCache
raceprocctx uintptr

deferpool []*_defer // pool of available defer structs (see panic.go)
deferpoolbuf [32]*_defer

// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
goidcache uint64
goidcacheend uint64

// Queue of runnable goroutines. Accessed without lock.
runqhead uint32
runqtail uint32
runq [256]guintptr
// runnext, if non-nil, is a runnable G that was ready'd by
// the current G and should be run next instead of what's in
// runq if there's time remaining in the running G's time
// slice. It will inherit the time left in the current time
// slice. If a set of goroutines is locked in a
// communicate-and-wait pattern, this schedules that set as a
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
//
// Note that while other P's may atomically CAS this to zero,
// only the owner P can CAS it to a valid G.
runnext guintptr

// Available G's (status == Gdead)
gFree struct {
gList
n int32
}

sudogcache []*sudog
sudogbuf [128]*sudog

// Cache of mspan objects from the heap.
mspancache struct {
// We need an explicit length here because this field is used
// in allocation codepaths where write barriers are not allowed,
// and eliminating the write barrier/keeping it eliminated from
// slice updates is tricky, more so than just managing the length
// ourselves.
len int
buf [128]*mspan
}

// Cache of a single pinner object to reduce allocations from repeated
// pinner creation.
pinnerCache *pinner

trace pTraceState

palloc persistentAlloc // per-P to avoid mutex

// Per-P GC state
gcAssistTime int64 // Nanoseconds in assistAlloc
gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)

// limiterEvent tracks events for the GC CPU limiter.
limiterEvent limiterEvent

// gcMarkWorkerMode is the mode for the next mark worker to run in.
// That is, this is used to communicate with the worker goroutine
// selected for immediate execution by
// gcController.findRunnableGCWorker. When scheduling other goroutines,
// this field must be set to gcMarkWorkerNotWorker.
gcMarkWorkerMode gcMarkWorkerMode
// gcMarkWorkerStartTime is the nanotime() at which the most recent
// mark worker started.
gcMarkWorkerStartTime int64

// gcw is this P's GC work buffer cache. The work buffer is
// filled by write barriers, drained by mutator assists, and
// disposed on certain GC state transitions.
gcw gcWork

// wbBuf is this P's GC write barrier buffer.
//
// TODO: Consider caching this in the running G.
wbBuf wbBuf

runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point

// statsSeq is a counter indicating whether this P is currently
// writing any stats. Its value is even when not, odd when it is.
statsSeq atomic.Uint32

// Timer heap.
timers timers

// maxStackScanDelta accumulates the amount of stack space held by
// live goroutines (i.e. those eligible for stack scanning).
// Flushed to gcController.maxStackScan once maxStackScanSlack
// or -maxStackScanSlack is reached.
maxStackScanDelta int64

// gc-time statistics about current goroutines
// Note that this differs from maxStackScan in that this
// accumulates the actual stack observed to be used at GC time (hi - sp),
// not an instantaneous measure of the total stack size that might need
// to be scanned (hi - lo).
scannedStackSize uint64 // stack size of goroutines scanned by this P
scannedStacks uint64 // number of goroutines scanned by this P

// preempt is set to indicate that this P should be enter the
// scheduler ASAP (regardless of what G is running on it).
preempt bool

// pageTraceBuf is a buffer for writing out page allocation/free/scavenge traces.
//
// Used only if GOEXPERIMENT=pagetrace.
pageTraceBuf pageTraceBuf

// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}
  • P holds a queue of runnable (Grunnable) goroutines: its local run queue
  • When a P runs Go code, it pops from this local queue first, lock-free, which boosts concurrency
  • If the local queue is empty, it takes half of another P's queue; that is the work-stealing part of the scheduler (and that path does require taking the scheduler lock)

Struct Sched

Sched is the scheduler implementation's own bookkeeping structure

type schedt struct {
goidgen atomic.Uint64
lastpoll atomic.Int64 // time of last network poll, 0 if currently polling
pollUntil atomic.Int64 // time to which current poll is sleeping

lock mutex

// When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be
// sure to call checkdead().

midle muintptr // idle m's waiting for work
nmidle int32 // number of idle m's waiting for work
nmidlelocked int32 // number of locked m's waiting for work
mnext int64 // number of m's that have been created and next M ID
maxmcount int32 // maximum number of m's allowed (or die)
nmsys int32 // number of system m's not counted for deadlock
nmfreed int64 // cumulative number of freed m's

ngsys atomic.Int32 // number of system goroutines

pidle puintptr // idle p's
npidle atomic.Int32 // number of idle Ps
nmspinning atomic.Int32 // See "Worker thread parking/unparking" comment in proc.go.
needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.

// Global runnable queue.
runq gQueue
runqsize int32

// disable controls selective disabling of the scheduler.
//
// Use schedEnableUser to control this.
//
// disable is protected by sched.lock.
disable struct {
// user disables scheduling of user goroutines.
user bool
runnable gQueue // pending runnable Gs
n int32 // length of runnable
}

// Global cache of dead G's.
gFree struct {
lock mutex
stack gList // Gs with stacks
noStack gList // Gs without stacks
n int32
}

// Central cache of sudog structs.
sudoglock mutex
sudogcache *sudog

// Central pool of available defer structs.
deferlock mutex
deferpool *_defer

// freem is the list of m's waiting to be freed when their
// m.exited is set. Linked through m.freelink.
freem *m

gcwaiting atomic.Bool // gc is waiting to run
stopwait int32
stopnote note
sysmonwait atomic.Bool
sysmonnote note

// safePointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
safePointFn func(*p)
safePointWait int32
safePointNote note

profilehz int32 // cpu profiling rate

procresizetime int64 // nanotime() of last change to gomaxprocs
totaltime int64 // ∫gomaxprocs dt up to procresizetime

// sysmonlock protects sysmon's actions on the runtime.
//
// Acquire and hold this mutex to block sysmon from interacting
// with the rest of the runtime.
sysmonlock mutex

// timeToRun is a distribution of scheduling latencies, defined
// as the sum of time a G spends in the _Grunnable state before
// it transitions to _Grunning.
timeToRun timeHistogram

// idleTime is the total CPU time Ps have "spent" idle.
//
// Reset on each GC cycle.
idleTime atomic.Int64

// totalMutexWaitTime is the sum of time goroutines have spent in _Gwaiting
// with a waitreason of the form waitReasonSync{RW,}Mutex{R,}Lock.
totalMutexWaitTime atomic.Int64

// stwStoppingTimeGC/Other are distributions of stop-the-world stopping
// latencies, defined as the time taken by stopTheWorldWithSema to get
// all Ps to stop. stwStoppingTimeGC covers all GC-related STWs,
// stwStoppingTimeOther covers the others.
stwStoppingTimeGC timeHistogram
stwStoppingTimeOther timeHistogram

// stwTotalTimeGC/Other are distributions of stop-the-world total
// latencies, defined as the total time from stopTheWorldWithSema to
// startTheWorldWithSema. This is a superset of
// stwStoppingTimeGC/Other. stwTotalTimeGC covers all GC-related STWs,
// stwTotalTimeOther covers the others.
stwTotalTimeGC timeHistogram
stwTotalTimeOther timeHistogram

// totalRuntimeLockWaitTime (plus the value of lockWaitTime on each M in
// allm) is the sum of time goroutines have spent in _Grunnable and with an
// M, but waiting for locks within the runtime. This field stores the value
// for Ms that have exited.
totalRuntimeLockWaitTime atomic.Int64
}
  • It holds the idle M list, the idle P list, and the global queue of runnable Gs

The G-P-M Model

The G-P-M model evolved from a thread pool:

  • Call each worker thread a worker; every thread runs one worker
  • Each worker simply loops, pulling tasks off a queue and executing them

In G-P-M terms:

  • G is the task to be completed
  • M is a worker (one thread)
  • Sched is the global queue of runnable Gs (plus assorted auxiliary state)
  • P arrived later, in go-1.1, to stop a blocked G from wasting a whole M

Breaking the G-P-M model down:

  • G: goroutine
    • Created with the go keyword; wraps the code to be executed
    • A user-level resource, transparent to the OS: lightweight, cheap to create in large numbers, with low context-switch cost
  • P: Processor, a logical processor
    • By default the runtime creates as many Ps as there are CPUs; runtime.GOMAXPROCS (or the GOMAXPROCS environment variable) can override the count, as the sketch below shows
    • P's job is to manage Gs: each P owns a local run queue and provides the local resources its Gs need to run on an M
  • M: Machine, an OS thread
    • Executes the parallel tasks wrapped in Gs; sometimes called the physical processor
    • An OS resource whose count is bounded by the OS; normally there are more Gs than active Ms
    • The runtime scheduler spreads the Gs fairly across the Ms
  • G and M:
    • G is the task, M is the worker thread that actually executes it; P brokers the connection between the two
  • G and P:
    • P manages Gs: it hands them to an M for execution and supplies the system resources they use; one P manages all the Gs in its local queue (P:G is 1:n)
  • P and M:
    • P hands its Gs to an M to run; when a G blocks, the P can unbind from that M and bind an idle M to keep running the rest of its queue (P:M is 1:1)

A P pulls Gs from Sched into its own run queue (switches within that queue are lightweight and never enter the kernel), then hands them to its M for execution. If a G blocks, the P unbinds from the M and finds an idle M to keep running the queue; if the queue drains completely, the P steals half the runnable Gs from a randomly chosen other P
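
The knobs mentioned above are visible from the runtime package; a small sketch for poking at them:

package main

import (
    "fmt"
    "runtime"
)

func main() {
    fmt.Println("Ps (GOMAXPROCS):", runtime.GOMAXPROCS(0)) // 0 = query without changing
    fmt.Println("CPUs:           ", runtime.NumCPU())
    fmt.Println("goroutines:     ", runtime.NumGoroutine())
}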

Creating & Destroying Goroutines

runtime.newproc creates a new G structure (the core work is done by runtime.newproc1)

func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreason waitReason) *g {
if fn == nil {
fatal("go of nil func value")
}

mp := acquirem() /* acquire the current M */
pp := mp.p.ptr() /* get the M's current P */
newg := gfget(pp) /* try to reuse a G from the free list */
if newg == nil {
newg = malg(stackMin) /* allocate a new g with a stackMin-sized stack */
casgstatus(newg, _Gidle, _Gdead) /* move the new g from _Gidle to _Gdead */
allgadd(newg) /* register it in the runtime's allg list */
}
if newg.stack.hi == 0 {
throw("newproc1: newg missing stack")
}

if readgstatus(newg) != _Gdead {
throw("newproc1: new g is not Gdead")
}

totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
totalSize = alignUp(totalSize, sys.StackAlign)
sp := newg.stack.hi - totalSize
if usesLR {
// caller's LR
*(*uintptr)(unsafe.Pointer(sp)) = 0
prepGoExitFrame(sp)
}
if GOARCH == "arm64" {
// caller's FP
*(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0
}

memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
newg.sched.sp = sp /* save sp, pc and other context into g.sched */
newg.stktopsp = sp
newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
newg.sched.g = guintptr(unsafe.Pointer(newg))
gostartcallfn(&newg.sched, fn)
newg.parentGoid = callergp.goid
newg.gopc = callerpc
newg.ancestors = saveAncestors(callergp)
newg.startpc = fn.fn
if isSystemGoroutine(newg, false) {
sched.ngsys.Add(1)
} else {
// Only user goroutines inherit pprof labels.
if mp.curg != nil {
newg.labels = mp.curg.labels
}
if goroutineProfile.active {
// A concurrent goroutine profile is running. It should include
// exactly the set of goroutines that were alive when the goroutine
// profiler first stopped the world. That does not include newg, so
// mark it as not needing a profile before transitioning it from
// _Gdead.
newg.goroutineProfiled.Store(goroutineProfileSatisfied)
}
}
// Track initial transition?
newg.trackingSeq = uint8(cheaprand())
if newg.trackingSeq%gTrackingPeriod == 0 {
newg.tracking = true
}
gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo)) /* account the new stack with the GC */

// Get a goid and switch to runnable. Make all this atomic to the tracer.
trace := traceAcquire()
var status uint32 = _Grunnable
if parked {
status = _Gwaiting
newg.waitreason = waitreason
}
casgstatus(newg, _Gdead, status)

/* allocate a goid for the initialized G (the caller, runtime.newproc, then queues it on the current M's P) */
if pp.goidcache == pp.goidcacheend {
// Sched.goidgen is the last allocated id,
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
// At startup sched.goidgen=0, so main goroutine receives goid=1.
pp.goidcache = sched.goidgen.Add(_GoidCacheBatch)
pp.goidcache -= _GoidCacheBatch - 1
pp.goidcacheend = pp.goidcache + _GoidCacheBatch
}
newg.goid = pp.goidcache
pp.goidcache++
newg.trace.reset()
if trace.ok() {
trace.GoCreate(newg, newg.startpc, parked)
traceRelease(trace)
}

// Set up race context.
if raceenabled {
newg.racectx = racegostart(callerpc)
newg.raceignore = 0
if newg.labels != nil {
// See note in proflabel.go on labelSync's role in synchronizing
// with the reads in the signal handler.
racereleasemergeg(newg, unsafe.Pointer(&labelSync))
}
}
releasem(mp)

return newg
}

When wakep wakes a P, the scheduler tries to find an available M to bind to it, creating a new M if necessary; the subsequent call chain is:

  • newproc -> newproc1 -> wakep (if the P count is below the limit) -> startm -> newm -> newosproc -> mstart (thread entry) -> schedule -> execute -> the goroutine runs
  • execute restores the context staged by newproc1, which jumps execution straight into the new goroutine

When the goroutine's function finishes and returns, it returns into runtime.goexit, which does the cleanup: it sets the G's status to Gdead and hangs the G on the P's free list

Goroutine Preemption

Go's preemption is quite primitive compared with an OS scheduler: there are no time slices and no priorities for goroutines. A goroutine is only preempted when it blocks in a system call for a long time or has simply been running too long

The runtime starts a background thread running the sysmon function, which periodically does epoll work and also checks whether any P has been running for too long

func sysmon() {
lock(&sched.lock)
sched.nmsys++
checkdead()
unlock(&sched.lock)

lasttrace := int64(0)
idle := 0 // how many cycles in succession we had not wokeup somebody
delay := uint32(0)

for {
if idle == 0 { // start with 20us sleep...
delay = 20
} else if idle > 50 { // start doubling the sleep after 1ms...
delay *= 2
}
if delay > 10*1000 { // up to 10ms
delay = 10 * 1000
}
usleep(delay)

// sysmon should not enter deep sleep if schedtrace is enabled so that
// it can print that information at the right time.
//
// It should also not enter deep sleep if there are any active P's so
// that it can retake P's from syscalls, preempt long running G's, and
// poll the network if all P's are busy for long stretches.
//
// It should wakeup from deep sleep if any P's become active either due
// to exiting a syscall or waking up due to a timer expiring so that it
// can resume performing those duties. If it wakes from a syscall it
// resets idle and delay as a bet that since it had retaken a P from a
// syscall before, it may need to do it again shortly after the
// application starts work again. It does not reset idle when waking
// from a timer to avoid adding system load to applications that spend
// most of their time sleeping.
now := nanotime()
if debug.schedtrace <= 0 && (sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs) {
lock(&sched.lock)
if sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs {
syscallWake := false
next := timeSleepUntil()
if next > now {
sched.sysmonwait.Store(true)
unlock(&sched.lock)
// Make wake-up period small enough
// for the sampling to be correct.
sleep := forcegcperiod / 2
if next-now < sleep {
sleep = next - now
}
shouldRelax := sleep >= osRelaxMinNS
if shouldRelax {
osRelax(true)
}
syscallWake = notetsleep(&sched.sysmonnote, sleep)
if shouldRelax {
osRelax(false)
}
lock(&sched.lock)
sched.sysmonwait.Store(false)
noteclear(&sched.sysmonnote)
}
if syscallWake {
idle = 0
delay = 20
}
}
unlock(&sched.lock)
}

lock(&sched.sysmonlock)
// Update now in case we blocked on sysmonnote or spent a long time
// blocked on schedlock or sysmonlock above.
now = nanotime()

// trigger libc interceptors if needed
if *cgo_yield != nil {
asmcgocall(*cgo_yield, nil)
}
// poll network if not polled for more than 10ms
lastpoll := sched.lastpoll.Load()
if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
sched.lastpoll.CompareAndSwap(lastpoll, now)
list, delta := netpoll(0) // non-blocking - returns list of goroutines
if !list.empty() {
// Need to decrement number of idle locked M's
// (pretending that one more is running) before injectglist.
// Otherwise it can lead to the following situation:
// injectglist grabs all P's but before it starts M's to run the P's,
// another M returns from syscall, finishes running its G,
// observes that there is no work to do and no other running M's
// and reports deadlock.
incidlelocked(-1)
injectglist(&list)
incidlelocked(1)
netpollAdjustWaiters(delta)
}
}
if GOOS == "netbsd" && needSysmonWorkaround {
// netpoll is responsible for waiting for timer
// expiration, so we typically don't have to worry
// about starting an M to service timers. (Note that
// sleep for timeSleepUntil above simply ensures sysmon
// starts running again when that timer expiration may
// cause Go code to run again).
//
// However, netbsd has a kernel bug that sometimes
// misses netpollBreak wake-ups, which can lead to
// unbounded delays servicing timers. If we detect this
// overrun, then startm to get something to handle the
// timer.
//
// See issue 42515 and
// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
if next := timeSleepUntil(); next < now {
startm(nil, false, false)
}
}
if scavenger.sysmonWake.Load() != 0 {
// Kick the scavenger awake if someone requested it.
scavenger.wake()
}
// retake P's blocked in syscalls
// and preempt long running G's
if retake(now) != 0 {
idle = 0
} else {
idle++
}
// check if we need to force a GC
if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && forcegc.idle.Load() {
lock(&forcegc.lock)
forcegc.idle.Store(false)
var list gList
list.push(forcegc.g)
injectglist(&list)
unlock(&forcegc.lock)
}
if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now {
lasttrace = now
schedtrace(debug.scheddetail > 0)
}
unlock(&sched.sysmonlock)
}
}
  • If sysmon sees a P in the Prunning state whose G has been running for more than 10 ms, it sets that G's stackguard to StackPreempt
  • This effectively plants a flag telling the G to yield to the scheduler at the next suitable moment (the next stack check)
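
Note that a loop containing no function calls has no stack checks to trip over; before go-1.14 such a loop could occupy its P indefinitely, and go-1.14 added signal-based asynchronous preemption to cover exactly this case. A small experiment (a sketch; on modern runtimes main keeps running, while on very old ones it would hang with a single P):

package main

import (
    "fmt"
    "runtime"
    "time"
)

func main() {
    runtime.GOMAXPROCS(1) // one P, so the busy loop must be preempted for main to resume
    go func() {
        for { // no calls, hence no cooperative preemption points
        }
    }()
    time.Sleep(100 * time.Millisecond)
    fmt.Println("main still runs: the busy goroutine was preempted")
}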

Memory Management

Go is a garbage-collected language; its memory management has two main parts:

  • the memory pool
  • the garbage collector

Memory Pool

Go's allocator follows the same design as the tcmalloc library: a pooled allocator sitting directly on top of the OS's mmap and friends

  • For multithreading, every thread keeps local memory and falls back to the global allocation chain when it runs short
  • To avoid fragmentation, large blocks are allocated in whole pages, while small blocks are carved into a set of fixed size classes; an arbitrary-size request is rounded up to the nearest class and handed a whole block, avoiding arbitrary splitting

Go gives every system thread a local MCache (the MCache field reachable from struct M)

  • Small allocations come straight out of the MCache, with periodic garbage collection
  • Large objects are allocated directly from the globally managed heap in page (4k) units

The allocator's data structures:

  • FixAlloc: a free-list allocator for fixed-size (128kB) objects, used by the allocator to manage its own storage
  • MHeap: the allocation heap, managed at page granularity (4kB)
  • MSpan: a run of pages managed by MHeap (the basic allocation unit), carved into equal-size blocks
  • MCentral: the shared free list for a given size class
  • MCache: the per-M cache for small objects

Garbage Collection

Go's garbage collector has gone through several strategies:

  • Mark and sweep (go-1.3)
    • Decide whether an object is garbage (is it referenced, directly or indirectly, from the root set?)
    • Mark: start from the root objects and find and mark every live object in the heap
    • Sweep: walk all objects in the heap, reclaim the unmarked garbage objects and put the memory back on the free lists
  • Tri-color marking (go-1.5)
    • Black: reachable, i.e. in use, and already scanned
    • Gray: directly referenced by a black object, but not yet scanned itself
    • White: the initial color of every object; anything still white when scanning finishes is garbage
    • The tri-color invariant: black must never point to white; black may point to gray, and gray may point to white
  • Incremental collector (go-1.8)
    • tri-color marking + the hybrid write barrier

Runtime Symbol Information

When a Go program panics it prints a traceback that contains not just the call chain but file names and line numbers:

package main

import "fmt"

func main() {
    defer func() {
        fmt.Println("test")
    }()

    var i = 1
    var j = 0
    if j == 0 {
        panic("err") /* panic makes the function return early, running its defers on the way out */
    }
    k := i / j
    fmt.Printf("%d / %d = %d\n", i, j, k)
}
test
panic: err

goroutine 1 [running]:
main.main()
/home/yhellow/桌面/gotest/test.go:13 +0x49

C's assert can produce a similar effect, but the underlying mechanism is completely different:

__assert_fail("0", "test.c", 8u, "main");
  • The C compiler bakes the strings to print directly into the __assert_fail call at compile time
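
Go instead resolves function names, files and lines at run time from tables shipped inside the binary; runtime.Caller and runtime.FuncForPC query exactly this data:

package main

import (
    "fmt"
    "runtime"
)

func main() {
    pc, file, line, ok := runtime.Caller(0) // answered from the pclntab
    if ok {
        fmt.Println(runtime.FuncForPC(pc).Name(), file, line)
    }
}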

A Brief Look at pclntab

The compiler emits extra metadata at build time, including a table mapping instruction addresses to source line numbers (pc->line, the pclntab for short); this is where the runtime symbol information comes from

pclntab stands for Program Counter Line Table; its header layout is roughly:

  • Magic Number
  • instruction size quantum
  • ptr size
  • number of functions
  • number of source files
  • text section start address
  • function name table offset
  • source file table offset
  • pc table offset
  • func table offset

After running the go_parser-master script, IDA shows:

.gopclntab:00000000004B8220 F1 FF FF FF                   runtime_symtab dd 0FFFFFFF1h            ; DATA XREF: LOAD:0000000000400398↑o
.gopclntab:00000000004B8220 ; .noptrdata:runtime_firstmoduledata↓o
.gopclntab:00000000004B8220 ; Magic Number
.gopclntab:00000000004B8224 00 00 dw 0
.gopclntab:00000000004B8226 01 db 1 ; instruction size quantum
.gopclntab:00000000004B8227 08 db 8 ; ptr size
.gopclntab:00000000004B8228 9B 05 00 00 00 00 00 00 dq 59Bh ; Functions number
.gopclntab:00000000004B8230 B1 00 00 00 00 00 00 00 dq 0B1h ; srcfile count number
.gopclntab:00000000004B8238 00 10 40 00 00 00 00 00 dq offset internal_cpu_Initialize ; text section start addr, =firstmoduladata.text
.gopclntab:00000000004B8240 60 00 00 00 00 00 00 00 dq 60h ; func names table offset, real addr: 0x4b8280
.gopclntab:00000000004B8248 A0 C1 00 00 00 00 00 00 dq 0C1A0h ; Source file table addr: 0x4c4b60
.gopclntab:00000000004B8250 40 C9 00 00 00 00 00 00 dq 0C940h ; src file table offset, real addr: 0x4c4b60
.gopclntab:00000000004B8258 C0 E3 00 00 00 00 00 00 dq 0E3C0h ; pc table offset, real addr: 0x4c65e0
.gopclntab:00000000004B8260 00 A9 03 00 00 00 00 00 dq 3A900h ; func table offset, real addr: 0x4f2b20

Each function can carry metadata and PC-Value tables. This runtime symbol information is generated by the compiler at build time and stored in the executable; when the program runs, the table is loaded into memory, where it assists the Go runtime library

The function symbol table itself takes the form of a PC lookup table; in IDA:

.gopclntab:00000000004F2B20 00 00 00 00                   runtime_functab dd 0                    ; DATA XREF: .noptrdata:0000000000517D28↓o
.gopclntab:00000000004F2B20 ; .noptrdata:0000000000517D40↓o
.gopclntab:00000000004F2B20 ; Function internal_cpu_Initialize @ 0x401000
.gopclntab:00000000004F2B24 E0 2C 00 00 dd 2CE0h ; Func Struct @ 0x4f5800
.gopclntab:00000000004F2B28 60 00 00 00 dd 60h ; Function internal_cpu_processOptions @ 0x401060
.gopclntab:00000000004F2B2C 38 2D 00 00 dd 2D38h ; Func Struct @ 0x4f5858
.gopclntab:00000000004F2B30 C0 05 00 00 dd 5C0h ; Function internal_cpu_doinit @ 0x4015c0
.gopclntab:00000000004F2B34 90 2D 00 00 dd 2D90h ; Func Struct @ 0x4f58b0
.gopclntab:00000000004F2B38 40 0E 00 00 dd 0E40h ; Function internal_cpu_cpuid @ 0x401e40
.gopclntab:00000000004F2B3C D8 2D 00 00 dd 2DD8h ; Func Struct @ 0x4f58f8
  • The first entry of each pair is the function address; the second is its Func Struct (describing that function)
.gopclntab:00000000004F5800 00 00 00 00                   dd 0                                    ; Func Entry @ 0x401000
.gopclntab:00000000004F5804 00 00 00 00 dd 0
.gopclntab:00000000004F5808 10 00 00 00 dd 10h ; args
.gopclntab:00000000004F580C 00 00 00 00 dd 0 ; deferreturn
.gopclntab:00000000004F5810 01 00 00 00 dd 1 ; pcsp
.gopclntab:00000000004F5814 08 00 00 00 dd 8 ; pcfile
.gopclntab:00000000004F5818 0B 00 00 00 dd 0Bh ; pcln
.gopclntab:00000000004F581C 04 00 00 00 dd 4 ; npcdata
.gopclntab:00000000004F5820 00 00 00 00 dd 0 ; cuOffset
.gopclntab:00000000004F5824 7D 00 00 00 dd 7Dh ; startline
.gopclntab:00000000004F5828 00 db 0 ; func_type: normal
.gopclntab:00000000004F5829 00 db 0 ; func_flag
.gopclntab:00000000004F582A 00 db 0
.gopclntab:00000000004F582B 07 db 7 ; nfuncdata
  • The corresponding struct in the Go source:
type _func struct {
sys.NotInHeap // Only in static data

entryOff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart
nameOff int32 // function name, as index into moduledata.funcnametab.

args int32 // in/out args size
deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.

pcsp uint32
pcfile uint32
pcln uint32
npcdata uint32
cuOffset uint32 // runtime.cutab offset of this function's CU
startLine int32 // line number of start of function (func keyword/TEXT directive)
funcID abi.FuncID // set for certain special runtime functions
flag abi.FuncFlag
_ [1]byte // pad
nfuncdata uint8 // must be last, must end on a uint32-aligned boundary

// The end of the struct is followed immediately by two variable-length
// arrays that reference the pcdata and funcdata locations for this
// function.

// pcdata contains the offset into moduledata.pctab for the start of
// that index's table. e.g.,
// &moduledata.pctab[_func.pcdata[_PCDATA_UnsafePoint]] is the start of
// the unsafe point table.
//
// An offset of 0 indicates that there is no table.
//
// pcdata [npcdata]uint32

// funcdata contains the offset past moduledata.gofunc which contains a
// pointer to that index's funcdata. e.g.,
// *(moduledata.gofunc + _func.funcdata[_FUNCDATA_ArgsPointerMaps]) is
// the argument pointer map.
//
// An offset of ^uint32(0) indicates that there is no entry.
//
// funcdata [nfuncdata]uint32
}