コードの構文解析ツールを作ってみた
この世にコードの構文解析ツールなんぞたくさんあるけど、JavaScriptやら正規表現やらの勉強がてら作ってみた。
ソースコードをいくつかの正規表現に掛けて、
- 該当する部分
- 該当しない部分
に振り分けていく方法をとってみた。
これで意外とうまくいっちゃうのでびっくり。
(function() {
    // Object on which the two public names are published. Assigning to an
    // undeclared identifier throws in strict mode / ES modules, so the global
    // object is looked up explicitly instead.
    var root = typeof globalThis !== 'undefined' ? globalThis :
               typeof window !== 'undefined' ? window :
               typeof global !== 'undefined' ? global : this;

    /**
     * Finds the first non-empty match of `re` in `text` that starts at or
     * after `from`.
     *
     * `lastIndex` is set explicitly before every search, so the result never
     * depends on state left behind by an earlier (possibly aborted) scan.
     *
     * @param {RegExp} re - Pattern to search with; expected to carry the `g` flag.
     * @param {string} text - Text being scanned.
     * @param {number} from - First position a match may start at.
     * @returns {Array|null} The `exec` result, or null when nothing more matches.
     */
    function exec_from(re, text, from) {
        var start = from;
        var found;
        while (start <= text.length) {
            re.lastIndex = start;
            found = re.exec(text);
            // A match that starts before `start` means `lastIndex` was ignored
            // (the pattern has no `g` flag); stop instead of looping forever.
            if (found === null || found.index < start) {
                break;
            }
            if (found[0] !== '') {
                return found;
            }
            // Empty matches carry no token; retry one character further on.
            start = found.index + 1;
        }
        re.lastIndex = 0;
        return null;
    }

    /**
     * Scans `text` with several patterns at once. At each step the pattern
     * whose next match starts earliest wins (ties go to the pattern listed
     * first); its text is passed to the callback at the same position in
     * `matches`, and the text skipped before it is passed to `unmatch`.
     *
     * Every callback is invoked with `this` set to `object` and receives
     * `(object, piece)`. Empty pieces are never reported.
     *
     * @param {RegExp[]} regexps - Patterns to scan with.
     * @param {string} text - Text to split.
     * @param {Object} object - Receiver and first argument of every callback.
     * @param {Function[]} matches - One callback per entry of `regexps`.
     * @param {Function} unmatch - Callback for text that no pattern matched.
     */
    function re_match_any(regexps, text, object, matches, unmatch) {
        // pending[i]: undefined = must search, null = exhausted, else a match
        // that has been found but not consumed yet.
        var pending = [];
        var index = 0;
        var best, found, i;
        for (;;) {
            best = -1;
            for (i = 0; i < regexps.length; i++) {
                found = pending[i];
                if (found === null) {
                    continue;
                }
                // A cached match that starts inside text already consumed by
                // another pattern is stale and has to be searched again.
                if (found === undefined || found.index < index) {
                    found = pending[i] = exec_from(regexps[i], text, index);
                    if (found === null) {
                        continue;
                    }
                }
                if (best < 0 || found.index < pending[best].index) {
                    best = i;
                }
            }
            if (best < 0) {
                break;
            }
            found = pending[best];
            pending[best] = undefined;
            if (index < found.index) {
                unmatch.call(object, object, text.substring(index, found.index));
            }
            matches[best].call(object, object, found[0]);
            index = found.index + found[0].length;
        }
        // Only forward a remainder that actually contains text; an empty one
        // would otherwise travel down the chain and surface as an empty word.
        if (index < text.length) {
            unmatch.call(object, object, text.substring(index));
        }
    }

    /**
     * Splits `text` into the pieces that match `re` and the pieces between
     * them, reporting them in source order to `match` and `unmatch`.
     *
     * @param {RegExp} re - Pattern to scan with; expected to carry the `g` flag.
     * @param {string} text - Text to split.
     * @param {Object} object - Receiver and first argument of both callbacks.
     * @param {Function} match - Called with each matching piece.
     * @param {Function} unmatch - Called with each non-empty piece in between.
     */
    function re_match(re, text, object, match, unmatch) {
        re_match_any([re], text, object, [match], unmatch);
    }

    /**
     * Tokenizer that classifies source text by running it through a chain of
     * regular expressions and reporting each piece to `handler`.
     *
     * The constructor tokenizes `code` immediately. Handler callbacks are
     * called with `this` set to the tokenizer and receive `(tokenizer, text)`;
     * the tokenizer itself never stores anything, so a handler that wants
     * `next()` to return tokens has to call `tokenizer.push(...)`.
     *
     * @param {Object} language - Provides `re_comments`, `re_strings`,
     *     `re_spaces`, `re_symbols`, `re_numerics` and `re_keywords`.
     * @param {string} code - Source text to tokenize.
     * @param {Object} handler - Provides `token_comment`, `token_string`,
     *     `token_space`, `token_symbol`, `token_numeric`, `token_keyword`
     *     and `token_word`.
     */
    function CodeTokenizer(language, code, handler) {
        // Calling without `new` would otherwise write to the global object
        // (sloppy mode) or throw (strict mode).
        if (!(this instanceof CodeTokenizer)) {
            return new CodeTokenizer(language, code, handler);
        }
        this.language = language;
        this.handler = handler;
        this.tokens = [];
        this.index = 0;
        this.split(code);
    }

    CodeTokenizer.prototype = {
        // Replacing the prototype object drops the default back-reference.
        constructor: CodeTokenizer,

        /**
         * Returns the next pushed token and advances the read position.
         * @returns {*} The token, or null once all tokens have been read.
         */
        next: function() {
            if (this.index >= this.tokens.length) {
                return null;
            }
            return this.tokens[this.index++];
        },

        /**
         * Appends a token to the list read by `next()`.
         * @param {*} token - Whatever representation the handler chose.
         * @returns {CodeTokenizer} This tokenizer, for chaining.
         */
        push: function(token) {
            this.tokens.push(token);
            return this;
        },

        /**
         * Entry point: extracts comments and strings, then passes the
         * remaining text on to `split_codes`.
         *
         * Comments and strings are scanned together so that whichever starts
         * first wins: `//` inside a string literal stays part of the string,
         * and a quote inside a comment stays part of the comment.
         *
         * @param {string} code - Source text to tokenize.
         */
        split: function(code) {
            re_match_any(
                [this.language.re_comments, this.language.re_strings],
                code,
                this,
                [this.handler.token_comment, this.handler.token_string],
                this.split_codes
            );
        },

        /**
         * Extracts string literals only and passes the rest to `split_codes`.
         * `split` no longer routes through this stage; it is kept so that
         * code calling it directly continues to work.
         *
         * @param {CodeTokenizer} tokenizer - Unused; same object as `this`.
         * @param {string} code - Text to split.
         */
        split_bodys: function(tokenizer, code) {
            re_match(
                this.language.re_strings,
                code,
                this,
                this.handler.token_string,
                this.split_codes
            );
        },

        /**
         * Extracts whitespace runs and passes the rest to `split_tokens`.
         * @param {CodeTokenizer} tokenizer - Unused; same object as `this`.
         * @param {string} code - Text to split.
         */
        split_codes: function(tokenizer, code) {
            re_match(
                this.language.re_spaces,
                code,
                this,
                this.handler.token_space,
                this.split_tokens
            );
        },

        /**
         * Extracts symbol runs and passes the rest to `split_words`.
         * @param {CodeTokenizer} tokenizer - Unused; same object as `this`.
         * @param {string} code - Text to split.
         */
        split_tokens: function(tokenizer, code) {
            re_match(
                this.language.re_symbols,
                code,
                this,
                this.handler.token_symbol,
                this.split_words
            );
        },

        /**
         * Reports numeric pieces and passes the rest to `split_keywords`.
         * @param {CodeTokenizer} tokenizer - Unused; same object as `this`.
         * @param {string} code - Text to split.
         */
        split_words: function(tokenizer, code) {
            re_match(
                this.language.re_numerics,
                code,
                this,
                this.handler.token_numeric,
                this.split_keywords
            );
        },

        /**
         * Last stage: reports keywords, and everything else as a plain word.
         * @param {CodeTokenizer} tokenizer - Unused; same object as `this`.
         * @param {string} code - Text to split.
         */
        split_keywords: function(tokenizer, code) {
            re_match(
                this.language.re_keywords,
                code,
                this,
                this.handler.token_keyword,
                this.handler.token_word
            );
        }
    };

    /**
     * Language definitions. Each entry is a factory returning a fresh set of
     * regular expressions for `CodeTokenizer`.
     */
    var CodeLanguage = {
        /**
         * Builds the pattern set for JavaScript source.
         * @returns {Object} Object holding the six `re_*` patterns.
         */
        javascript: function() {
            var language = {};

            // [\s\S] also crosses \r, so block comments in CRLF files match.
            language.re_comments = new RegExp([
                '/\\*[\\s\\S]*?\\*/',
                '//.*'
            ].join('|'), 'g');

            // Escape-aware: a backslash consumes the following character, so
            // an escaped quote does not end the literal. Literals do not
            // extend across a line break.
            language.re_strings = new RegExp([
                '"(?:\\\\.|[^"\\\\\\r\\n])*"',
                "'(?:\\\\.|[^'\\\\\\r\\n])*'"
            ].join('|'), 'g');

            language.re_spaces = new RegExp('\\s+', 'g');

            // Adjacent symbols are reported as a single run.
            language.re_symbols = new RegExp('(' + [
                '!', '=', '\\(', '\\)', '\\{', '\\}', '<', '>', '\\[', '\\]',
                '\\+', '-', '\\*', '/', '%', '\\.', ',', ';', '\\?', ':',
                '&', '\\|', '\\^', '~'
            ].join('|') + ')+', 'g');

            // Anchored at both ends: only a piece that is a number as a whole
            // counts as numeric.
            language.re_numerics = new RegExp('^[0-9]+$', 'g');

            // Anchored at both ends: only a piece that is a keyword as a whole
            // counts as a keyword.
            language.re_keywords = new RegExp('^(' + [
                'abstract', 'boolean', 'break', 'byte', 'case', 'catch',
                'char', 'class', 'const', 'continue', 'debugger', 'default',
                'delete', 'do', 'double', 'else', 'enum', 'export', 'extends',
                'final', 'finally', 'float', 'for', 'function', 'goto', 'if',
                'implements', 'import', 'in', 'instanceof', 'int',
                'interface', 'long', 'native', 'new', 'null', 'package',
                'private', 'protected', 'public', 'return', 'short', 'static',
                'super', 'switch', 'synchronized', 'this', 'throw', 'throws',
                'transient', 'try', 'typeof', 'var', 'void', 'volatile',
                'while', 'with'
            ].join('|') + ')$', 'g');

            return language;
        }
    };

    root.CodeTokenizer = CodeTokenizer;
    root.CodeLanguage = CodeLanguage;
})();