编译原理-Lab4

第一个任务

这部分的实验要完成将 TAC 的指令序列转换成目标代码，本文中在这里对实现做了一些限制，假设数据类型只包含整数类型，不包含如浮点数、数组、结构和指针等其它数据类型（也不包括全局变量）

目标语言为汇编语言，这样做的目的就是尽可能简单地实现目标代码的生成并能运行程序观察运行结果，完成了一个可运行的编译程序

MIPS32 架构

MIPS32 的架构是一种基于固定长度的定期编码指令集，并采用导入/存储（load/store）数据模型

目标语言可选定 MIPS32 指令序列，可以在 SPIM Simulator 上运行

MIPS32 体系结构有 :

32个通用寄存器
2个特殊寄存器（整数乘除法寄存器）
32个浮点寄存器
PC：程序计数器（在 MIPS32 下，PC 不是一个通用寄存器）

本实验只使用 t1-t9 s0-s7，并且不区分功能

TAC 指令和 MIPS32 指令的对应关系如下表所示：

中间代码	MIPS32 指令
LABEL x	x:
x :=#k	li reg(x),k
x := y	move reg(x), reg(y)
x := y + z	add reg(x), reg(y) , reg(z)
x := y - z	sub reg(x), reg(y) , reg(z)
x := y * z	mul reg(x), reg(y) , reg(z)
x := y / z	div reg(y) , reg(z) mflo reg(x)
GOTO x	j x
RETURN x	move $v0, reg(x) jr $ra
IF x==y GOTO z	beq reg(x),reg(y),z
IF x!=y GOTO z	bne reg(x),reg(y),z
IF x>y GOTO z	bgt reg(x),reg(y),z
IF x>=y GOTO z	bge reg(x),reg(y),z
IF x<y GOTO z	ble reg(x),reg(y),z
IF x<=y GOTO z	blt reg(x),reg(y),z
X:=CALL f	jal f move reg(x),$v0

其中 reg(x) 表示变量 x 所分配的寄存器
寄存器 v0 用来存放子程序的返回值

安装 MIPS 编译器和模拟器

安装编译器和依赖：

1 2	sudo apt-get install emdebian-archive-keyring sudo apt-get install linux-libc-dev-mips-cross libc6-mips-cross libc6-dev-mips-cross binutils-mips-linux-gnu gcc-mips-linux-gnu g++-mips-linux-gnu

检测是否安装成功：

1	mips-linux-gnu-gcc -dumpmachine

安装 mips 虚拟机：

1	sudo apt-get install qemu-system-mips

下载内核映像和初始化文件：（不推荐）

1
2

wget http://ftp.debian.org/debian/dists/stable/main/installer-mipsel/current/images/malta/netboot/initrd.gz
wget http://ftp.debian.org/debian/dists/stable/main/installer-mipsel/current/images/malta/netboot/vmlinuz-5.10.0-20-4kc-malta

创建虚拟磁盘：（不推荐）

1	qemu-img create -f qcow2 hda.img 2G

开启虚拟机：（不推荐）

qemu-system-mips -M malta \
  -m 512 -hda hda.img \
  -kernel vmlinuz-5.10.0-20-4kc-malta \
  -initrd initrd.gz \
  -append "console=ttyS0 nokaslr" \
  -nographic

上面这种方法有点 BUG，目前没有启动成功

下载内核映像和初始化文件：（推荐）

1 2	wget http://people.debian.org/\~aurel32/qemu/mips/debian_squeeze_mips_standard.qcow2 wget http://people.debian.org/~aurel32/qemu/mips/vmlinux-2.6.32-5-4kc-malta

开启虚拟机：（推荐）

1	qemu-system-mips -M malta -kernel vmlinux-2.6.32-5-4kc-malta -hda debian_squeeze_mips_standard.qcow2 -append "root=/dev/sda1 console=tty0"

user：user
password：user

安装 libguestfs：用于挂载 qcow2 磁盘镜像

1	sudo apt-get install libguestfs-tools

挂载/卸载命令：

sudo guestmount -a debian_squeeze_mips_standard.qcow2 -m /dev/sda1 rootfs
sudo cp $1 rootfs/home/user
sudo chmod 777 rootfs/home/user/$1
sudo guestunmount rootfs

寄存器分配算法

在目标生成阶段，一个很重要的工作就是寄存器的分配：

最为简单的就是朴素的寄存器分配算法，效率最低，也最容易实现
对于一个基本块采用的局部寄存器分配算法，实现起来相对不是太难，且效率上有一定提升

本实验采用朴素的寄存器分配算法：

将所有的变量或临时变量都放入内存中
每翻译一条中间代码之前都需要吧要用到的变量先加载到寄存器中，得到该代码的计算结果之后又需要将结果写回内存

可分配的寄存集合：（不区分功能）

1	string regs[] = {"t1","t2","t3","t4","t5","t6","t7","t8","t9","s0","s1","s2","s3","s4","s5","s6","s7"};

核心算法如下：

vector<string> variables; /* 变量数组 */
map<string,string> table; /* 记录已经分配寄存器的变量{key:变量,value:寄存器} */
map<string,int> reg_ok;

string Get_R(const string& temp_str){ /* 引用是原变量的一个别名,跟原来的变量实质上是同一个东西 */
    for (auto it = variables.begin();it!=variables.end();++it){ /* it为迭代器 */
        if (*it == temp_str){ /* 去除重复 */
            it = variables.erase(it);
            break;
        }
    } 

    if (table.find(temp_str) != table.end()){
        /* 如果已经存在寄存器分配,那么直接返回寄存器 */
        return "$"+table[temp_str];
    }
    else{ /* 目前传入的变量只有两种类型:temp临时变量,var函数形参 */
        vector<string> keys;
        for (auto &it:table) /* 遍历table的key(已经分配寄存器) */
            /* 类似于python的"for i in data" */
            keys.emplace_back(it.first); /* 直接在vector尾部创建这个元素 */

        for (auto &key:keys){ /* 当遇到未分配寄存器的变量时,清空之前所有分配的临时变量的映射关系(临时变量只使用一次) */
            if (key.find("temp")!=string::npos && 
                find(variables.begin(),variables.end(),key) == variables.end()){
                reg_ok[table[key]] = 1;
                table.erase(key);
            }
        } 

        for (const auto & reg : regs){ /* 对于所有寄存器 */
            if(reg_ok[reg] == 1){ /* 如果寄存器可用 */
                table[temp_str] = reg; /* 将可用寄存器分配给该变量,映射关系存到table中 */
                reg_ok[reg] = 0; /* 寄存器reg设置为已用 */
                return "$"+reg;
            }
        } 
    }
}

朴素寄存器分配的翻译

翻译的操作如下表：

中间代码	MIPS32 指令
x :=#k	li $t3 ,k sw $t3, x的偏移量($sp)
x := y	lw $t1, y的偏移量($sp) move $t3 , $t1 sw $t3, x的偏移量($sp)
x := y + z	lw $t1, y的偏移量($sp) lw $t2, z的偏移量($sp) add $t3 , $t1,$t2 sw $t3, x的偏移量($sp)
x := y - z	l w $t1, y的偏移量($sp) l w $t2, z的偏移量($sp) sub $t3 , $t1,$t2 sw $t3, x的偏移量($sp)
x := y * z	l w $t1, y的偏移量($sp) l w $t2, z的偏移量($sp) mul $t3 , $t1,$t2 sw $t3, x的偏移量($sp)
x := y / z	l w $t1, y的偏移量($sp) l w $t2, z的偏移量($sp) mul $t3 , $t1,$t2 div $t1,$t2 mflo $t3 sw $t3, x的偏移量($sp)
RETURN x	move $v0, x的偏移量($sp)jr $ra
IF x==y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) beq $t1,$t2 ,z
IF x!=y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) bne $t1,$t2 ,z
IF x>y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) bgt $t1,$t2 ,z
IF x>=y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) bge $t1,$t2 ,z
IF x<y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) ble $t1,$t2 ,z
IF x<=y GOTO z	l w $t1, x的偏移量($sp) l w $t2, y的偏移量($sp) blt $t1,$t2 ,z
X:=CALL f

对于函数调用 X:=CALL f，需要完成开辟活动记录的空间、参数的传递和保存返回地址等，函数调用返回后，需要恢复返回地址，读取函数返回值以及释放活动记录空间（活动记录的空间布局没有一个统一的标准，可根据自己的理解保存好数据，并能正确使用即可）

通常，使用4个寄存器完成参数的传递，多余4个的参数使用活动记录空间，这里做了简单处理，所有参数都使用活动记录空间，具体步骤：

首先根据保存在函数调用指令中的 offset，找到符号表中的函数定义点，获取函数的参数个数i，这样就可得到在 X:=CALL f 之前的 i 个 ARG 形式的中间代码，获得 i 个实参值所存放的单元，取出后送到形式参数的单元中
根据符号表记录的活动记录大小，开辟活动记录空间和保存返回地址
使用 jal f 转到函数f处
释放活动记录空间和恢复返回地址
使用 sw $v0 , x 的偏移量 ($sp) 获取返回值送到 X 的存储单元中

TAC 转化为 MIPS32

这就是本实验最复杂的步骤了，首先需要一个函数来读取记录中间变量的文件：

string Load_Inter(const string& filename){
    string lines;
    string temp_line;
    ifstream in(filename);
    if(in){
        while (getline(in,temp_line)){
            if (temp_line == " ")
                continue;
            lines.append(temp_line);
            lines.append("\n");
        }
    }
    in.close();
    return lines;
}

另一个函数来记录所有变量到 variables 中：

vector<string> variables; /* 变量数组 */

void Load_Var(string Inter){ /* 传入参数为中间代码字符串 */
    regex temp_re("temp\\d+"); /* 正则表达式{temp+任意数字} */
    regex var_re("var\\d+");
    smatch result;
    string temp_line;
    
    string::const_iterator iter = Inter.begin();
    string::const_iterator iterEnd = Inter.end();
    while (regex_search(iter,iterEnd,result,temp_re)){ /* 匹配regex规则 */
        temp_line = result[0];
        variables.emplace_back(temp_line); /* 记录到变量数组中 */
        iter = result[0].second; /* 更新搜索起始位置,搜索剩下的字符串 */
    }
    iter = Inter.begin();
    iterEnd = Inter.end();
    while (regex_search(iter,iterEnd,result,var_re)){
        temp_line = result[0];
        variables.emplace_back(temp_line);
        iter = result[0].second;
    } 
}

所有临时变量均分配在寄存器中

最后需要一个函数来逐行进行翻译，我们把这个函数拆分为多段来分析：

将每行 string 按空格存成数组：

vector<string> line;
string temp_res;
stringstream input(temp_str); 
while (input>>temp_res) /* 可以理解为队列,只能从头节点取出值,新数据接在尾节点 */
    line.emplace_back(temp_res); 

if(line.size()==0){
    return " "; /* 规避'\n'的影响 */
}

stringstream 类相当于一个管道（队列），可以用它来分割字符串

处理简单指令：

size_t func_argn;
size_t func_param;

string temp_return;
if(line[0] == "LABEL"){ /* 标记跳转 */
    return line[1]+":";
}
if (line[0] == "FUNCTION"){ /* 标记函数 */
    for (const auto & reg : regs) /* 重置reg_ok和table */
        reg_ok[reg] = 1;
    table.clear();
    func_param = 0;
    return line[1]+":";
}
if (line[0] == "RETURN"){ /* 标记返回 */
    func_argn = 0;
    return "\tmove $v0,"+Get_R(line[1])+"\n\tjr $ra";
}
if (line[0] == "GOTO"){ /* 执行跳转 */
    return "\tj "+line[1];
}

中间代码转化为 MIPS 的过程以函数为单位，读取到一个新的函数标签时，应该重置 reg_ok 和 table
当一个函数调用其子函数时，程序会将寄存器信息保存在栈中，我们不用担心覆盖寄存器带来的影响

处理函数参数：

string params[] = {"a0","a1","a2","a3"};

if (line[0] == "ARG"){ /* 为函数参数分配寄存器 */
    func_argn++;
    return "\tmove $"+params[func_argn-1]+","+Get_R(line.back());
}
if (line[0] == "PARAM"){ /* 声明函数参数的寄存器 */
    func_param++;
    table[line.back()] = params[func_param-1]; /* 永久记录对应的{var,params} */
}

函数传参使用 a0-a3（规定函数参数不超过4个）
全局变量 func_argn 和 func_param 用于记录当前参数的标号

处理函数调用：

if (line[0] == "CALL"){ /* 函数调用 */
    func_argn = 0;
    return string("")+"\taddi $sp,$sp,-60\n\tsw $ra,0($sp)\n\tsw $t0,4($sp)\n\tsw $t1,8($sp)\n\tsw $t2,12($sp)\n\tsw $t3,16($sp)\n"+ \
        "\tsw $t4,20($sp)\n\tsw $t5,24($sp)\n\tsw $t6,28($sp)\n\tsw $t7,32($sp)\n\tsw $t8,36($sp)\n\tsw $t9,40($sp)\n"+ \
        "\tsw $a0,44($sp)\n\tsw $a1,48($sp)\n\tsw $a3,52($sp)\n\tsw $a4,56($sp)\n"+ \
        "\tjal "+line.back()+"\n"+ \
        "\tlw $ra,0($sp)\n\tlw $t0,4($sp)\n\tlw $t1,8($sp)\n\tlw $t2,12($sp)\n\tlw $t3,16($sp)\n"+ \
        "\tlw $t4,20($sp)\n\tlw $t5,24($sp)\n\tlw $t6,28($sp)\n\tlw $t7,32($sp)\n\tlw $t8,36($sp)\n\tlw $t9,40($sp)\n"+ \
        "\tlw $a0,44($sp)\n\tlw $a1,48($sp)\n\tlw $a3,52($sp)\n\tlw $a4,56($sp)\n"+ \
        "\taddi $sp,$sp,60";
}

调用前保存寄存器，调用后恢复寄存器（t1-t9 a0-a3）

IF 判断语句：

if (line[0] == "IF") {
    if (line[2] == "=="){
        temp_return = "\tbeq ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
    if (line[2] == "!="){
        temp_return = "\tbne ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
    if (line[2] == ">"){
        temp_return = "\tbgt ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
    if (line[2] == "<"){
        temp_return = "\tblt ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
    if (line[2] == ">="){
        temp_return = "\tbge ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
    if (line[2] == "<="){
        temp_return = "\tble ";
        temp_return += Get_R(line[1])+",";
        temp_return += Get_R(line[3])+",";
        temp_return += line.back();
        return temp_return;
    }
}

根据不同的条件，使用对应的汇编代码即可

赋值语句：

if (line[1] == ":=") {
    if (line.size() == 3){ /* 对应简单赋值语句 */
        if (line[2][0] == '#'){ // 立即数
            return "\tli " + Get_R(line[0]) + ","+line.back().substr(1);
        }
        else{
            temp_return = "\tmove "; // 寄存器
            temp_return += Get_R(line[0])+',';
            temp_return += Get_R(line[2]);
            return temp_return;
        }
    }
    if (line.size() == 5){ /* 对应带有运算的赋值语句 */
        if (line[3] == "+"){
            if (line[2][0] == '#'){ // 立即数
                temp_return = "\taddi ";
                temp_return += Get_R(line[0])+",";
                temp_return += Get_R(line[2])+",";
                temp_return += line.back().substr(1);
                return temp_return;
            }
            else{
                temp_return = "\tadd "; // 寄存器
                temp_return += Get_R(line[0])+",";
                temp_return += Get_R(line[2])+",";
                temp_return += Get_R(line.back());
                return temp_return;
            }
        }
        else if (line[3] == "-"){
            if (line[2][0] == '#'){ // sub不支持使用立即数
                temp_return = "";
                return temp_return;
            }
            else{
                temp_return = "\tsub "; // 寄存器
                temp_return += Get_R(line[0])+",";
                temp_return += Get_R(line[2])+",";
                temp_return += Get_R(line.back());
                return temp_return;
            }
        }
        else if (line[3] == "*"){
            temp_return = "\tmul ";
            temp_return += Get_R(line[0])+",";
            temp_return += Get_R(line[2])+",";
            temp_return += Get_R(line.back());
            return temp_return;
        }
        else if (line[3] == "/"){
            temp_return = "\tdiv ";
            temp_return += Get_R(line[2])+",";
            temp_return += Get_R(line.back()) + "\n\tmflo ";
            temp_return += Get_R(line[0]);
            return temp_return;
        }
        else if (line[3] == "<"){ /* 左移 */
            temp_return = "\tslt ";
            temp_return += Get_R(line[0])+",";
            temp_return += Get_R(line[2])+",";
            temp_return += Get_R(line.back());
            return temp_return;
        }
        else if (line[3] == ">"){ /* 右移 */
            temp_return = "\tslt ";
            temp_return += Get_R(line[0])+",";
            temp_return += Get_R(line.back())+",";
            temp_return += Get_R(line[2]);
            return temp_return;
        }
    } 
    if (line[2] == "CALL") /* 对应带有函数调用的赋值语句 */
    {
        func_argn = 0;
        return string("")+"\taddi $sp,$sp,-60\n\tsw $ra,0($sp)\n\tsw $t0,4($sp)\n\tsw $t1,8($sp)\n\tsw $t2,12($sp)\n\tsw $t3,16($sp)\n"+ \
            "\tsw $t4,20($sp)\n\tsw $t5,24($sp)\n\tsw $t6,28($sp)\n\tsw $t7,32($sp)\n\tsw $t8,36($sp)\n\tsw $t9,40($sp)\n"+ \
            "\tsw $a0,44($sp)\n\tsw $a1,48($sp)\n\tsw $a2,52($sp)\n\tsw $a3,56($sp)\n"+ \
            "\tjal "+line.back()+"\n"+ \
            "\tlw $ra,0($sp)\n\tlw $t0,4($sp)\n\tlw $t1,8($sp)\n\tlw $t2,12($sp)\n\tlw $t3,16($sp)\n"+ \
            "\tlw $t4,20($sp)\n\tlw $t5,24($sp)\n\tlw $t6,28($sp)\n\tlw $t7,32($sp)\n\tlw $t8,36($sp)\n\tlw $t9,40($sp)\n"+ \
            "\tlw $a0,44($sp)\n\tlw $a1,48($sp)\n\tlw $a2,52($sp)\n\tlw $a3,56($sp)\n"+ \
            "\taddi $sp,$sp,60\n\tmove "+Get_R(line[0])+", $v0";
    }
}

根据不同的条件，使用对应的汇编代码即可

最后需要一个函数组织并输出 demo.s 汇编文件：

void write_to_txt(const vector<string>& obj){
    ofstream out("./demo.s");
    string temp =".data\n"
                "_prompt: .asciiz \"Enter an integer:\"\n"
                "_ret: .asciiz \"\\n\"\n"
                ".globl main\n"
                ".text\n";
    out<<temp;
    for (auto & it:obj)
        out<<it<<endl;
    out.close();
}

编译与结果

编译命令：

1	g++ main.cpp -o test4

测试文件：

int fast(int n)
{
	int a1 = 14;
	int a2 = 15;
	int a3 = 16;
	int a4;
	int i;

	for(i=0;i<n;i++){
		a4 = (a1+a2)*a3;
	}
	
	return 0;
}
int main()
{
	int n = 5;
	fast(n);
	return 0;
}

中间代码：

FUNCTION fast :
  PARAM var2
  temp1 := #14
  var3 := temp1
  temp2 := #15
  var4 := temp2
  temp3 := #16
  var5 := temp3
  temp4 := #0
  var7 := temp4
LABEL label4 :
  IF var7 < var2 GOTO label3
  GOTO label2
LABEL label3 :
  temp5 := var3 + var4
  temp6 := temp5 * var5
  var6 := temp6
  var7 := var7 + 1
  GOTO label4
LABEL label2 :
  temp7 := #0
  RETURN temp7
LABEL label1 :

FUNCTION main :
  temp8 := #5
  var9 := temp8
  ARG var9
  temp9 := CALL fast
  temp10 := #0
  RETURN temp10
LABEL label5 :

汇编代码：

.data
_prompt: .asciiz "Enter an integer:"
_ret: .asciiz "\n"
.globl main
.text
fast:
	li $t1,14
	move $t2,$t1
	li $t1,15
	move $t3,$t1
	li $t1,16
	move $t4,$t1
	li $t1,0
	move $t5,$t1
label4:
	blt $t5,$a0,label3
	j label2
label3:
	add $t1,$t2,$t3
	mul $t6,$t1,$t4
	move $t1,$t6
	add $t5,$t5,$t6
	j label4
label2:
	li $t7,0
	move $v0,$t7
	jr $ra
label1:
main:
	li $t1,5
	move $t2,$t1
	move $a0,$t2
	addi $sp,$sp,-60
	sw $ra,0($sp)
	sw $t0,4($sp)
	sw $t1,8($sp)
	sw $t2,12($sp)
	sw $t3,16($sp)
	sw $t4,20($sp)
	sw $t5,24($sp)
	sw $t6,28($sp)
	sw $t7,32($sp)
	sw $t8,36($sp)
	sw $t9,40($sp)
	sw $a0,44($sp)
	sw $a1,48($sp)
	sw $a2,52($sp)
	sw $a3,56($sp)
	jal fast
	lw $ra,0($sp)
	lw $t0,4($sp)
	lw $t1,8($sp)
	lw $t2,12($sp)
	lw $t3,16($sp)
	lw $t4,20($sp)
	lw $t5,24($sp)
	lw $t6,28($sp)
	lw $t7,32($sp)
	lw $t8,36($sp)
	lw $t9,40($sp)
	lw $a0,44($sp)
	lw $a1,48($sp)
	lw $a2,52($sp)
	lw $a3,56($sp)
	addi $sp,$sp,60
	move $t1, $v0
	li $t1,0
	move $v0,$t1
	jr $ra
label5: