On Github GatorLUG / js-language-tooling-talk
Code → Tokens → AST → Constraints → Transformer → Interpreter
(Some of these are optional)
Code → Tokens → AST → Constraints → Transformer → Interpreter
/\s+/ => whitespace /"[^"]*"/ => string /#.+$/ => comment /[_a-z][_a-z0-9]*/i => identifier /[0-9]+/ => integer⇩
_"foo" 42bar#buzz⇩
identifier string whitespace integer identifier comment⇩
identifier string integer identifier
/**
 * Parses a whole token stream into a Program AST node.
 *
 * @param {object} tokens - Token stream exposing `hasNext()`, `next()` and
 *   `consume(type)`. `next()` is used as a non-consuming peek that returns a
 *   token object with a `type` property.
 * @returns {{type: 'Program', body: Array}} The program node; `body` holds one
 *   node per parsed statement.
 * @throws {SyntaxError} If a statement starts with an unsupported token type.
 */
function parse(tokens) {
  const body = [];
  while (tokens.hasNext()) {
    body.push(parseStatement(tokens));
  }
  return {type: 'Program', body};
}

/**
 * Parses one statement, dispatching on the type of the upcoming token.
 *
 * @param {object} tokens - Token stream (see `parse`).
 * @returns {object} A FunctionDeclaration node or the result of
 *   `parseCallExpression`.
 * @throws {SyntaxError} If the upcoming token starts neither construct.
 */
function parseStatement(tokens) {
  const {type} = tokens.next();
  if (type === 'FunctionKeyword') {
    // The sub-parsers need the stream; it must be passed along explicitly.
    return parseFunction(tokens);
  }
  if (type === 'Identifier') {
    return parseCallExpression(tokens);
  }
  throw new SyntaxError(`Unexpected token type: ${type}`);
}

/**
 * Parses `function ( <expressions> ) { <expressions> }`.
 *
 * @param {object} tokens - Token stream (see `parse`).
 * @returns {{type: 'FunctionDeclaration', args: Array, body: Array}}
 */
function parseFunction(tokens) {
  const args = [];
  const body = [];

  tokens.consume('FunctionKeyword');
  tokens.consume('(');
  // Compare the token's type, not the token object itself, and stop at end of
  // input so an unterminated list cannot loop forever. The consume() call that
  // follows is then left to report the missing delimiter (assumes consume()
  // rejects a mismatching token - confirm against the stream implementation).
  while (tokens.hasNext() && tokens.next().type !== ')') {
    args.push(parseExpression(tokens));
  }
  tokens.consume(')');

  tokens.consume('{');
  while (tokens.hasNext() && tokens.next().type !== '}') {
    body.push(parseExpression(tokens));
  }
  tokens.consume('}');

  return {type: 'FunctionDeclaration', args, body};
}

// ... (parseCallExpression, parseExpression, etc. are not shown here)
Recursive tree traversal!
// PEG-style grammar (syntax resembles PEG.js - confirm the generator in use).
// `program` is one or more statements and yields a Program node; a statement is
// either `function` (yields FunctionDeclaration with args/body) or
// `callExpression` (yields CallExpression with args). `_` matches optional
// whitespace. `expression` and the call name are the placeholder literals
// 'expr' and 'fname'.
program = _ body:(s:statement _ {return s;})+ { return {type: 'Program', body: body}; } _ 'whitespace' = [ \t\r\n]* statement = function / callExpression function = 'function' _ '(' _ args:(expr:expression _ {return expr;})* _ ')' _ '{' _ body:(s:statement _ {return s;})* _ '}' { return { type: 'FunctionDeclaration', args: args, body: body }; } expression = 'expr' callExpression = 'fname' _ '(' _ args:(expr:expression _ {return expr;})* _ ')' { return { type: 'CallExpression', args: args }; }
Some constructs aren't "context free", and can't be parsed this way.
def foo(): bar()
Example: Indentation-based languages don't have clear start/end tokens. How do you tell when to stop parsing a construct?
Extend the lexer, and insert indent/dedent tokens!
def foo(): bar()
1,0-1,3: NAME u'def' 1,4-1,7: NAME u'foo' 1,7-1,8: OP u'(' 1,8-1,9: OP u')' 1,9-1,10: OP u':' 1,10-1,11: NEWLINE u'\n' 2,0-2,4: INDENT u' ' 2,4-2,7: NAME u'bar' 2,7-2,8: OP u'(' 2,8-2,9: OP u')' 3,0-3,0: DEDENT '' 3,0-3,0: ENDMARKER ''
Try this at home using import tokenize!
Code → Tokens → AST → Constraints → Transformer → Interpreter
JavaScript doesn't do any of this. But a linter does!
Static analysis of dynamic languages is hard.
var obj = {foo: 1, bar: 2}; console.log(obj[prompt('What property should I access?')]);
Akin to the halting problem
module.exports = function(context) { return { "MemberExpression": function(node) { if (node.object.name === "console") { context.report(node, "Unexpected console statement."); } } }; };
/* @flow */ function foo(x) { return x * 10; } foo('Hello, world!');
hello.js:5:5,19: string This type is incompatible with hello.js:3:10,15: number
/* @flow */ function foo(x: string, y: number): string { return x.length * y; } foo('Hello', 42);
hello.js:3:10,21: number This type is incompatible with hello.js:2:37,42: string
Code → Tokens → AST → Constraints → Transformer → Interpreter
/**
 * Callback-style request: issues a GET to 'example.com' through `$.ajax`,
 * logs the `something` field of the response, then hands the response to
 * `callback`.
 *
 * @param {function} callback - Invoked with the response data on success.
 * @returns {undefined} Nothing is returned; the result arrives via `callback`.
 */
function update(callback) {
  const onSuccess = (data) => {
    console.log(data.something);
    callback(data);
  };

  const settings = {
    url: 'example.com',
    type: 'GET',
    success: onSuccess,
  };

  $.ajax(settings);
}
/**
 * Promise-style request: fetches 'example.com' through `$.ajax`, logs the
 * `something` field of the response, and passes the response down the chain.
 *
 * @returns {object} Whatever `.then` returns on the `$.ajax` result
 *   (presumably a promise resolving to the response data).
 */
function update() {
  return $.ajax('example.com').then((data) => {
    console.log(data.something);
    return data;
  }); // closes the .then( call
}
/**
 * Promise-style request: fetches 'example.com' through `$.ajax`, logs the
 * `something` field of the response, and passes the response down the chain.
 *
 * @returns {object} Whatever `.then` returns on the `$.ajax` result
 *   (presumably a promise resolving to the response data).
 */
function update() {
  return $.ajax('example.com').then((data) => {
    console.log(data.something);
    return data;
  }); // closes the .then( call
}
/**
 * async/await request: awaits `$.ajax('example.com')`, logs the `something`
 * field of the awaited value, and resolves with that value.
 *
 * @returns {Promise<object>} Resolves with the awaited response.
 */
async function update() {
  const response = await $.ajax('example.com');
  const { something } = response;
  console.log(something);
  return response;
}
/**
 * asm.js module computing the geometric mean of a range of doubles stored in
 * a heap buffer.
 *
 * The body deliberately keeps `var`, `|0` and unary `+` exactly as written:
 * these are asm.js type annotations, so do not modernize them.
 *
 * @param {object} stdlib - Object providing `Math.exp`, `Math.log` and
 *   `Float64Array` (e.g. the global object).
 * @param {object} foreign - Unused by this module.
 * @param {ArrayBuffer} buffer - Heap viewed as a Float64Array of input values.
 * @returns {{geometricMean: function(number, number): number}} Exported
 *   function taking `start` and `end` element indices (end exclusive).
 */
function GeometricMean(stdlib, foreign, buffer) {
  "use asm";

  var exp = stdlib.Math.exp;
  var log = stdlib.Math.log;
  var values = new stdlib.Float64Array(buffer);

  // Sums log(values[i]) for i in [start, end).
  function logSum(start, end) {
    start = start|0;
    end = end|0;

    var sum = 0.0, p = 0, q = 0;

    // asm.js forces byte addressing of the heap by requiring shifting by 3
    for (p = start << 3, q = end << 3; (p|0) < (q|0); p = (p + 8)|0) {
      sum = sum + +log(values[p>>3]);
    }

    return +sum;
  }

  // exp(mean of logs) over [start, end).
  function geometricMean(start, end) {
    start = start|0;
    end = end|0;

    return +exp(+logSum(start, end) / +((end - start)|0));
  }

  return { geometricMean: geometricMean };
}
"usually within a factor of 2 slowdown over native compilation with clang"
var output = recast.print(ast).code;
See Also: Effective JavaScript Codemods by @cpojer