简单JavaScript解释器

前言

在学习了一款 JavaScript 解释器的项目过后，想自己写一个 JavaScript 解释器：zuluoaaa/makeJs: A sub Javascript interpreter for interpreting itself (github.com)

在数据结构和分析逻辑上面很大程度参考了以上项目，不过我在原项目的基础上进行了魔改并添加了新的功能

PS：本项目只是简单的模拟了 JavaScript 解释器的执行过程，并没有涉及 JavaScript 的灵魂（一切皆对象）

词法分析

词法分析核心实现

词法分析其实就是一个线性扫描加上正则匹配的过程，核心函数如下：

function scan(){
    skipBlank();
    let {
        content,
        index,
        token,
        tokenBack
    } = gData;
    token.value = null;
    
    if(tokenBack !== null){
        let token = tokenBack;
        gData.tokenBack = null;
        gData.token.type = token.type;
        gData.token.value = token.value;
        return token;
    }
    if(index >= content.length){
        token.type = tokenTypes.T_EOF;
        return;
    }
    let value = nextChar();
    let next;

    switch (value) {
        case "+":
            token.type = tokenTypes.T_ADD;
            break;
        case "-":
            token.type = tokenTypes.T_SUB;
            break;
        case "*":
            next = nextChar();
            if(next === "/"){
                token.type = tokenTypes.T_RCMT;
            }else{
                putBack(next);
                token.type = tokenTypes.T_MUL;
            }

            break;
        case "/":
            next = nextChar();
            if(next === "*"){
                token.type = tokenTypes.T_LCMT;
            }
            else if(next === "/"){
                token.type = tokenTypes.T_LINE_CMT;
            }
            else{
                putBack(next);
                token.type = tokenTypes.T_DIV;
            }
            break;

        case ",":
            token.type = tokenTypes.T_COMMA;
            break;
        case "=":
            next = nextChar();
            if(next === "="){
                token.type = tokenTypes.T_EQ;
                next = nextChar();
                if(next !== "="){
                    putBack(next);
                }
            }else{
                token.type = tokenTypes.T_ASSIGN;
                putBack(next);
            }
            break;
        case ";":
            token.type = tokenTypes.T_SEMI;
            break;
        case "!":
            next = nextChar();
            if(next === "="){
                token.type = tokenTypes.T_NEQ;
            }
            putBack(next);
            token.type = tokenTypes.T_NOT;
            break;
        case ">":
             next = nextChar();
            if(next === "="){
                token.type = tokenTypes.T_GE;
            }else {
                token.type = tokenTypes.T_GT;
                putBack(next);
            }
            break;
        case "<":
             next = nextChar();
            if(next === "="){
                token.type = tokenTypes.T_LE;
            }else {
                token.type = tokenTypes.T_LT;
                putBack(next);
            }
            break;
        case "&":
            next = nextChar();
            if(next === "&"){
                token.type = tokenTypes.T_AND;
            }else {
                putBack(next);
            }
            break;
        case "|":
            next = nextChar();
            if(next === "|"){
                token.type = tokenTypes.T_OR;
            }else {
                putBack(next);
            }
            break;
        case "(":
            token.type = tokenTypes.T_LPT;
            break;
        case ")":
            token.type = tokenTypes.T_RPT;
            break;
        case "{":
            token.type = tokenTypes.T_LBR;
            break;
        case "}":
            token.type = tokenTypes.T_RBR;
            break;
        case "[":
            token.type = tokenTypes.T_LMBR;
            break;
        case "]":
            token.type = tokenTypes.T_RMBR;
            break;
        case "?":
            token.type = tokenTypes.T_QST;
            break;
        case ":":
            token.type = tokenTypes.T_COL;
            break;
        case "\"":
            token.type = tokenTypes.T_STRING;
            token.value = scanStr("\"");
            break;
        case "\'":
            token.type = tokenTypes.T_STRING;
            token.value = scanStr("\'");
            break;

        default:
            if(regNumber(value)){
                token.value = scanInt(value);
                token.type = tokenTypes.T_INT;
                break;
            }
            else if(regVar(value)){
                value = scanIdent(value);
                token.type = scanKeyword(value);
                token.value = value;
                break;
            }
            printErr(`Unrecognised char : (${value})`)
        }
    if(token.type === tokenTypes.T_LINE_CMT){
        skipOneLine();
        scan();
    }
    return true;
}

一些简单的符号可以直接匹配，而字符串和数字则需要借助正则表达式

次步骤会将程序简单转化为 token 流，并记录于 lexList 列表中：

function add(a,b){
    return a+b;
}
var a = 1;
var b = 2;
var c;
if(a > b){
    c = add(a,b) * add(a,b);
}
else{
    c = add(a,b) + add(a,b);
}
log(c);

此时 token 就只是标识符，按照顺序排布，没有任何语法上的含义

[
  Token { type: 'function', value: 'function' },
  Token { type: 'identifier', value: 'add' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: ',', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: '{', value: null },
  Token { type: 'return', value: 'return' },
  Token { type: 'identifier', value: 'a' },
  Token { type: '+', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ';', value: null },
  Token { type: '}', value: null },
  Token { type: 'var', value: 'var' },
  Token { type: 'identifier', value: 'a' },
  Token { type: '=', value: null },
  Token { type: 'number', value: 1 },
  Token { type: ';', value: null },
  Token { type: 'var', value: 'var' },
  Token { type: 'identifier', value: 'b' },
  Token { type: '=', value: null },
  Token { type: 'number', value: 2 },
  Token { type: ';', value: null },
  Token { type: 'var', value: 'var' },
  Token { type: 'identifier', value: 'c' },
  Token { type: ';', value: null },
  Token { type: 'if', value: 'if' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: '>', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: '{', value: null },
  Token { type: 'identifier', value: 'c' },
  Token { type: '=', value: null },
  Token { type: 'identifier', value: 'add' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: ',', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: '*', value: null },
  Token { type: 'identifier', value: 'add' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: ',', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: ';', value: null },
  Token { type: '}', value: null },
  Token { type: 'else', value: 'else' },
  Token { type: '{', value: null },
  Token { type: 'identifier', value: 'c' },
  Token { type: '=', value: null },
  Token { type: 'identifier', value: 'add' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: ',', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: '+', value: null },
  Token { type: 'identifier', value: 'add' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'a' },
  Token { type: ',', value: null },
  Token { type: 'identifier', value: 'b' },
  Token { type: ')', value: null },
  Token { type: ';', value: null },
  Token { type: '}', value: null },
  Token { type: 'identifier', value: 'log' },
  Token { type: '(', value: null },
  Token { type: 'identifier', value: 'c' },
  Token { type: ')', value: null },
  Token { type: ';', value: null }
]

语法分析

各种语句的处理

需要处理的语句有如下几种：

function statement(){
    let tree=null,left=null;
    let {token} = gData;
    getNextToken();
    while(true){
        switch(token.type){
            case tokenTypes.T_VAR:
            case tokenTypes.T_ARGUMENT:
                left = varDeclaration();
                break;
            case tokenTypes.T_IF:
                left = ifStatement();
                break;
            case tokenTypes.T_WHILE:
                left = whileStatement();
                break; 
            case tokenTypes.T_FUN:
                left = funStatement();
                break;
            case tokenTypes.T_RETURN:
                left = returnStatement();
                break;
            case tokenTypes.T_EOF:
            case tokenTypes.T_RBR:
            case tokenTypes.T_RPT:
                return tree;
            default:
                if(prefixParserMap[token.type]){
                    left = normalStatement();
                }else{
                    printErr(`unknown Syntax:${token.type} , at ${gData.line} line`);
                }
        }
        if(left !== null){
            if(tree === null){
                tree = left;
            }else{
                tree = new ASTNode().initTwoNode(ASTNodeTypes.T_GLUE,tree,left,null);
            }
        }
    }
}

glue结点的使用

在语法分析生成 AST 的过程中，可能会遇到一些没有关键的并列语句：

var a = 1;
var b = 2;
var c = 3;
var d = 4;
var e = 5;

这些语句看似是并列结构，但解释器往往会将其理解成有着父子结点关系的树形结构：

glue
|----glue
|    |----glue
|    |    |----glue
|    |    |    |----a
e    d    c    b

解释器会先读取 var a = 1 进而生成 a 节点
接着读取 var b = 2 生成 b 节点，然后创建 glue 节点作为 a b 的父节点

打印出的 AST 信息如下：

 || [_glue] (null),(null)
--- || [_glue] (null),(null)
------ || [_glue] (null),(null)
--------- || [_glue] (null),(null)
------------ || [_glue] (null),(null)
--------------- || [var] (a),(null)
--------------- || [=] (null),(null)
------------------ || [number] (1),(null)
------------------ || [leftValue] (a),(null)
------------ || [_glue] (null),(null)
--------------- || [var] (b),(null)
--------------- || [=] (null),(null)
------------------ || [number] (2),(null)
------------------ || [leftValue] (b),(null)
--------- || [_glue] (null),(null)
------------ || [var] (c),(null)
------------ || [=] (null),(null)
--------------- || [number] (3),(null)
--------------- || [leftValue] (c),(null)
------ || [_glue] (null),(null)
--------- || [var] (d),(null)
--------- || [=] (null),(null)
------------ || [number] (4),(null)
------------ || [leftValue] (d),(null)
--- || [_glue] (null),(null)
------ || [var] (e),(null)
------ || [=] (null),(null)
--------- || [number] (5),(null)
--------- || [leftValue] (e),(null)

列表结点的使用

在语法分析生成 AST 的过程中，可能会遇到没法简单处理成树形结构的数据：

function add(a,b,c,d,e){
    return a+b+c+d+e;
}
var a = [1,2,3,4,5];
var test = add(1,2,3,4,5);

由于 add(1,2,3,4,5) 中的参数必须分析为并列结构，因此这里需要使用列表来存储参数节点
对于函数参数和列表都需要这种处理方式

打印出的 AST 信息如下：

 || [_glue] (null),(null)
--- || [_glue] (null),(null)
------ || [function] (add),(null)
--------- || [_glue] (null),(null)
------------ || [_glue] (null),(null)
--------------- || [_glue] (null),(null)
------------------ || [_glue] (null),(null)
--------------------- || [argument] (a),(0)
--------------------- || [argument] (b),(1)
------------------ || [argument] (c),(2)
--------------- || [argument] (d),(3)
------------ || [argument] (e),(4)
--------- || [return] (null),(null)
------------ || [+] (null),(null)
--------------- || [+] (null),(null)
------------------ || [+] (null),(null)
--------------------- || [+] (null),(null)
------------------------ || [identifier] (a),(null)
------------------------ || [identifier] (b),(null)
--------------------- || [identifier] (c),(null)
------------------ || [identifier] (d),(null)
--------------- || [identifier] (e),(null)
------ || [_glue] (null),(null)
--------- || [var] (a),(null)
--------- || [=] (null),(null)
------------ || [array] ([ [ASTNode], [ASTNode], [ASTNode], [ASTNode], [ASTNode] ]),(null)
------------ || [leftValue] (a),(null)
--- || [_glue] (null),(null)
------ || [var] (test),(null)
------ || [=] (null),(null)
--------- || [execute function] (add),(null)
------------ || [args] ([ [ASTNode], [ASTNode], [ASTNode], [ASTNode], [ASTNode] ]),(null)
--------- || [leftValue] (test),(null)

普拉特分析法处理表达式

Pratt Parsing 是一种循环与递归相结合的算法，需要对以下两种表达式进行处理：

前缀表达式：一个操作符放在数字的前面（-a，!a，"a"，[a] ……）
中缀表达式：操作符在两个操作数的中间（a + b，a * c ……）

function prefix(type){ /* 处理前缀 */
    getNextToken();
    let right = parseExpression(precedenceList.prefix);
    putBackToken(gData.token);
    return new ASTNode().initUnaryNode(type,right,null);
}

function infix(precedence,left,type){ /* 处理中缀 */
    let right = parseExpression(precedence);
    return new ASTNode().initTwoNode(type,left,right,null);
}

前缀表达式的优先级往往是最高的
这里的函数调用是一个例外：原本函数调用属于前缀表达式，但本程序在识别 token 时会将函数名识别为 identifier，将左括号识别为符号，将参数识别为 identifier，这样函数调用就被当成了中缀表达式

下面看两个例子：

var c = 7*8+9;
||_glue null null
---||var c null
---||= null null
------||+ null null
---------||* null null
------------||number 7 null
------------||number 8 null
---------||number 9 null
------||leftValue c null

+ 优先级小于 *，因此 7 * 8 先回溯生成树，接着正常处理后续表达式生成 exp + 9 的树

var c = 7*(8+9);
||_glue null null
---||var c null
---||= null null
------||* null null
---------||number 7 null
---------||+ null null
------------||number 8 null
------------||number 9 null
------||leftValue c null

() 优先级小于 *，因此继续处理直到表达式结束后回溯，依次生成 8 + 9 和 7 * exp 的树

打印树形结构

基础版：

function printTree(ast,index) {
    if (!ast) {
      return;
    }
    console.log("%s || [%s] (%s)", Array(index+1).join("-"),ast.op,ast.value);
    index+=3;
    printTree(ast.left,index);
    printTree(ast.mid,index);
    printTree(ast.right,index);
}

优化版：

let printTree = function (ast, pre=[" ──"]) {
    if (!ast) {
        return;
    }
    for (var i = 0; i < pre.length; i++) {
        process.stdout.write(pre[i]);
    }
    console.log("[%s] (%s)", ast.op,ast.value);
    
  
    var bac = pre[pre.length - 1];
    if (pre[pre.length - 1] === " ├──") {
        pre[pre.length - 1] = " │  ";
    } else {
        pre[pre.length - 1] = "    ";
    }
    
    for (var i = 0; i < pre.length; i++) {
        process.stdout.write(pre[i]);
    }
    if(ast.left || ast.mid || ast.right)
        process.stdout.write(" │  ");
    console.log();

    pre.push(" ├──");
    
    if(!ast.mid && !ast.right){
        pre[pre.length - 1] = " └──";
    }
    printTree(ast.left,pre);
    if(!ast.right){
        pre[pre.length - 1] = " └──";
    }
    printTree(ast.mid,pre);
    pre[pre.length - 1] = " └──";
    printTree(ast.right,pre);
    
    pre.pop();
    pre[pre.length - 1] = bac;
}

优化版效果图：

function add(a,b){
    return a+b;
}
var a = 1;
var b = 2;
var c = a + b*add(a,b);
log(c);

──[_glue] (null)
    │  
    ├──[_glue] (null)
    │   │  
    │   ├──[_glue] (null)
    │   │   │  
    │   │   ├──[_glue] (null)
    │   │   │   │  
    │   │   │   ├──[function] (add)
    │   │   │   │   │  
    │   │   │   │   ├──[_glue] (null)
    │   │   │   │   │   │  
    │   │   │   │   │   ├──[argument] (a)
    │   │   │   │   │   │  
    │   │   │   │   │   └──[argument] (b)
    │   │   │   │   │      
    │   │   │   │   └──[return] (null)
    │   │   │   │       │  
    │   │   │   │       └──[+] (null)
    │   │   │   │           │  
    │   │   │   │           ├──[identifier] (a)
    │   │   │   │           │  
    │   │   │   │           └──[identifier] (b)
    │   │   │   │              
    │   │   │   └──[_glue] (null)
    │   │   │       │  
    │   │   │       ├──[var] (a)
    │   │   │       │  
    │   │   │       └──[=] (null)
    │   │   │           │  
    │   │   │           ├──[number] (1)
    │   │   │           │  
    │   │   │           └──[leftValue] (a)
    │   │   │              
    │   │   └──[_glue] (null)
    │   │       │  
    │   │       ├──[var] (b)
    │   │       │  
    │   │       └──[=] (null)
    │   │           │  
    │   │           ├──[number] (2)
    │   │           │  
    │   │           └──[leftValue] (b)
    │   │              
    │   └──[_glue] (null)
    │       │  
    │       ├──[var] (c)
    │       │  
    │       └──[=] (null)
    │           │  
    │           ├──[+] (null)
    │           │   │  
    │           │   ├──[identifier] (a)
    │           │   │  
    │           │   └──[*] (null)
    │           │       │  
    │           │       ├──[identifier] (b)
    │           │       │  
    │           │       └──[execute function] (add)
    │           │           │  
    │           │           └──[args] ([ [ASTNode], [ASTNode] ])
    │           │              
    │           └──[leftValue] (c)
    │              
    └──[execute function] (log)
        │  
        └──[args] ([ [ASTNode] ])

语义分析

处理函数调用

当遇到函数调用时，可能有如下3种情况：

该函数是解释器内置的
该函数是源程序定义的
该函数没有定义

如果该函数是解释器内置的，解释器会尝试在 buildInMethods 中查找该函数：

if(buildInMethods[astNode.value]){
    let arr = [];
    let i=0;
    while (typeof argument[i] !== "undefined"){
        arr.push(argument[i]);
        ++i;
    }
    buildInMethods[astNode.value](...arr);

如果该函数是源程序定义的，解释器需要知道该函数的具体实现：

1 2	case ASTNodeTypes.T_FUNCALL: return interpretFunCallAST(astNode,scope);

在 interpretFunCallAST 会设置 fnAST.option = "run" 并再次调用 interpretAST：

childScope.set("arguments",argument,ASTNodeTypes.T_ARGUMENT);
let fnAST = scope.get(astNode.value).value;
fnAST.option = "run";
return interpretAST(fnAST,null,childScope);

再次调用 interpretAST 会进入 ASTNodeTypes.T_FUN 分支，并尝试解析该函数：

case ASTNodeTypes.T_FUN:
    if(astNode.option === "run"){
        astNode.option = "";
        break;
    }else{
        scope.add(astNode.value);
        scope.set(astNode.value,astNode,ASTNodeTypes.T_FUN);
        return;
    }

若条件 astNode.option === "run" 成立，则代表了解释器遇到了函数调用，需要知道该函数的具体实现
否则代表解释器遇到了一个新的函数，并将该函数记录在了 scope 中

后续的操作和处理 block 是相同的，整个过程的结果将会记录在 Scope 中并参与后续的解析

内置函数的设计

这里我一共写了3个内置函数：

function logIn(...argument){ /* 等同于console.log */
    let arr = [];
    let i=0;
    while (typeof argument[i] !== "undefined"){
        arr.push(argument[i] ? argument[i].value : argument[i]);
        ++i;
    }
    console.log(...arr);
}

function showIn(argument){ /* 打印Scope结构体数据 */
    console.log(argument);
}

function treeIn(argument){ /* 打印astNode树形结构 */
    printTree(argument.astnode);
}

效果如下：

function add(a,b){
    return a+b;
}

var a = "a";
var b = "b";
var c = add(a,b);

tree(add);
show(a);
show(b);
log("result is :",c);

 ──[function] (add)
     │  
     ├──[_glue] (null)
     │   │  
     │   ├──[argument] (a)
     │   │  
     │   └──[argument] (b)
     │      
     └──[return] (null)
         │  
         └──[+] (null)
             │  
             ├──[identifier] (a)
             │  
             └──[identifier] (b)
                
{
  _inner: true,
  name: 'a',
  value: 'a',
  type: 'string',
  astnode: ASTNode {
    left: null,
    mid: null,
    right: null,
    op: 'leftValue',
    value: 'a',
    option: null
  }
}
{
  _inner: true,
  name: 'b',
  value: 'b',
  type: 'string',
  astnode: ASTNode {
    left: null,
    mid: null,
    right: null,
    op: 'leftValue',
    value: 'b',
    option: null
  }
}
result is : ab