On GitHub: GatorLUG / js-language-tooling-talk
Code → Tokens → AST → Constraints → Transformer → Interpreter
(Some of these are optional)
Code → Tokens → AST → Constraints → Transformer → Interpreter
/\s+/ => whitespace /"[^"]*"/ => string /#.+$/ => comment /[_a-z][_a-z0-9]*/i => identifier /[0-9]+/ => integer⇩
_"foo" 42bar#buzz⇩
identifier string whitespace integer identifier comment⇩
identifier string integer identifier
/**
 * Top-level parser entry point: turns a token stream into a Program node.
 *
 * Statements are parsed one after another for as long as the stream reports
 * that tokens remain.
 *
 * @param {object} tokens - Token stream exposing `hasNext()`; it is handed
 *   unchanged to `parseStatement` for each statement.
 * @returns {{type: string, body: Array}} A `Program` node whose `body` holds
 *   the parsed statements in source order.
 */
function parse(tokens) {
  const program = {type: 'Program', body: []};
  while (tokens.hasNext()) {
    const statement = parseStatement(tokens);
    program.body.push(statement);
  }
  return program;
}
/**
 * Parses a single statement by dispatching on the type of the upcoming token.
 *
 * @param {object} tokens - Token stream. `next()` is used as a lookahead;
 *   NOTE(review): this assumes `next()` does not consume the token, which is
 *   how `parseFunction` uses it in its loop conditions.
 * @returns {object} The node produced by `parseFunction` or
 *   `parseCallExpression`.
 * @throws {SyntaxError} If the upcoming token starts neither a function
 *   declaration nor a call expression.
 */
function parseStatement(tokens) {
  // Look ahead once and branch on the result.
  const {type} = tokens.next();

  if (type === 'FunctionKeyword') {
    // The sub-parsers read from the stream, so it has to be passed along;
    // calling them with no argument leaves their `tokens` undefined.
    return parseFunction(tokens);
  }
  if (type === 'Identifier') {
    return parseCallExpression(tokens);
  }
  throw new SyntaxError(`Unexpected token type: ${type}`);
}
/**
 * Parses a function declaration:
 * `FunctionKeyword ( expression* ) { expression* }`.
 *
 * @param {object} tokens - Token stream exposing `hasNext()`, `next()`
 *   (lookahead) and `consume(type)`.
 * @returns {{type: string, args: Array, body: Array}} A `FunctionDeclaration`
 *   node with the parsed argument and body expressions in source order.
 */
function parseFunction(tokens) {
  const args = [];
  const body = [];

  tokens.consume('FunctionKeyword');
  tokens.consume('(');
  // Compare the token's `type`, as parseStatement does; comparing the token
  // object itself to a string is never equal, so the loop could not stop at
  // the closing token. The hasNext() check ends the loop on truncated input
  // and leaves the missing closer for consume() to deal with.
  while (tokens.hasNext() && tokens.next().type !== ')') {
    // The expression parser reads from the stream, so pass it along.
    args.push(parseExpression(tokens));
  }
  tokens.consume(')');

  tokens.consume('{');
  while (tokens.hasNext() && tokens.next().type !== '}') {
    body.push(parseExpression(tokens));
  }
  tokens.consume('}');

  return {type: 'FunctionDeclaration', args, body};
}
// ...
Recursive tree traversal!
// Entry rule: optional whitespace, then one or more statements, each followed
// by optional whitespace. The action returns the root Program node.
program
= _ body:(s:statement _ {return s;})+ {
return {type: 'Program', body: body};
}
// Zero or more spaces, tabs, carriage returns or newlines. The quoted string
// after the rule name is its human-readable name.
_ 'whitespace'
= [ \t\r\n]*
// A statement is a function declaration or a call expression; the
// alternatives are tried in the order written.
statement
= function
/ callExpression
// 'function' '(' expression* ')' '{' statement* '}', with optional whitespace
// between the pieces. The action returns a FunctionDeclaration node.
function
= 'function' _ '('
_ args:(expr:expression _ {return expr;})*
_ ')' _ '{'
_ body:(s:statement _ {return s;})*
_ '}' {
return {
type: 'FunctionDeclaration',
args: args,
body: body
};
}
// Placeholder: an expression is just the literal text 'expr'.
expression
= 'expr'
// The literal text 'fname' followed by a parenthesised list of expressions.
// The returned CallExpression node records only the arguments.
callExpression
= 'fname' _ '('
_ args:(expr:expression _ {return expr;})*
_ ')' {
return {
type: 'CallExpression',
args: args
};
}
Some constructs aren't "context free", and can't be parsed this way.
# The body is marked only by its indentation; no closing token ends it.
def foo():
    bar()
Example: Indentation-based languages don't have clear start/end tokens. How do you tell when to stop parsing a construct?
Extend the lexer, and insert indent/dedent tokens!
def foo():
    bar()
1,0-1,3: NAME u'def'
1,4-1,7: NAME u'foo'
1,7-1,8: OP u'('
1,8-1,9: OP u')'
1,9-1,10: OP u':'
1,10-1,11: NEWLINE u'\n'
2,0-2,4: INDENT u'    '
2,4-2,7: NAME u'bar'
2,7-2,8: OP u'('
2,8-2,9: OP u')'
3,0-3,0: DEDENT ''
3,0-3,0: ENDMARKER ''
Try this at home using import tokenize!
Code → Tokens → AST → Constraints → Transformer → Interpreter
JavaScript doesn't do any of this. But a linter does!
Static analysis of dynamic languages is hard.
// An object literal with two keys, `foo` and `bar`.
var obj = {foo: 1, bar: 2};
// The key is whatever string prompt() returns at run time, so the property
// being read cannot be determined from the source text alone.
console.log(obj[prompt('What property should I access?')]);
Akin to the halting problem
module.exports = function(context) {
return {
"MemberExpression": function(node) {
if (node.object.name === "console") {
context.report(node, "Unexpected console statement.");
}
}
};
};
/* @flow */
function foo(x) {
return x * 10; // x is used as an operand of `*`
}
foo('Hello, world!'); // the call passes a string as x
hello.js:5:5,19: string This type is incompatible with hello.js:3:10,15: number
/* @flow */
function foo(x: string, y: number): string {
return x.length * y; // the product is returned where `string` is declared
}
foo('Hello', 42);
hello.js:3:10,21: number This type is incompatible with hello.js:2:37,42: string
Code → Tokens → AST → Constraints → Transformer → Interpreter
/**
 * Issues a GET request to 'example.com' and hands the response to `callback`.
 *
 * @param {function} callback - Called with the response data after its
 *   `something` property has been logged.
 * @returns {undefined}
 */
function update(callback) {
  const onSuccess = function(data) {
    console.log(data.something);
    callback(data);
  };
  const request = {url: 'example.com', type: 'GET', success: onSuccess};
  $.ajax(request);
}
/**
 * Requests 'example.com' and logs the `something` property of the response.
 *
 * @returns {object} The thenable produced by `$.ajax(...).then(...)`, which
 *   passes the response data through unchanged.
 */
function update() {
  return $.ajax('example.com').then((data) => {
    console.log(data.something);
    return data;
  }); // closes the .then( call, which was left open
}
/**
 * Requests 'example.com' and logs the `something` property of the response.
 *
 * @returns {object} The thenable produced by `$.ajax(...).then(...)`, which
 *   passes the response data through unchanged.
 */
function update() {
  return $.ajax('example.com').then((data) => {
    console.log(data.something);
    return data;
  }); // closes the .then( call, which was left open
}
/**
 * Awaits the response from 'example.com', logs its `something` property and
 * resolves with the response itself.
 *
 * @returns {Promise<object>} Resolves with the data returned by `$.ajax`.
 */
async function update() {
  const payload = await $.ajax('example.com');
  console.log(payload.something);
  return payload;
}
/**
 * asm.js module that computes the geometric mean of a run of 64-bit floats
 * stored in a heap buffer.
 *
 * @param {object} stdlib - Supplies Math.exp, Math.log and Float64Array.
 * @param {object} foreign - Not used by this module.
 * @param {object} buffer - Heap storage, viewed here as a Float64Array.
 * @returns {{geometricMean: function}} Object exposing geometricMean only.
 */
function GeometricMean(stdlib, foreign, buffer) {
"use asm";
var exp = stdlib.Math.exp;
var log = stdlib.Math.log;
var values = new stdlib.Float64Array(buffer);
// Sums log(values[i]) for i from start up to, but not including, end.
function logSum(start, end) {
// `|0` coerces the parameters to integers.
start = start|0;
end = end|0;
var sum = 0.0, p = 0, q = 0;
// asm.js forces byte addressing of the heap by requiring shifting by 3
// p and q are byte offsets (index << 3); each element is 8 bytes, so p
// advances by 8 and `p>>3` turns it back into an element index.
for (p = start << 3, q = end << 3; (p|0) < (q|0); p = (p + 8)|0) {
sum = sum + +log(values[p>>3]);
}
// Unary `+` coerces the result to a double.
return +sum;
}
// Geometric mean of values[start..end): exp of the mean of the logarithms.
// When start equals end this evaluates exp(0 / 0), which is NaN.
function geometricMean(start, end) {
start = start|0;
end = end|0;
return +exp(+logSum(start, end) / +((end - start)|0));
}
// Only geometricMean is exported; logSum stays internal to the module.
return { geometricMean: geometricMean };
}
"usually within a factor of 2 slowdown over native compilation with clang"
// Print the AST with recast and keep the `code` property of the result.
var output = recast.print(ast).code;
See Also: Effective JavaScript Codemods by @cpojer