gae アプリ 開発メモ

Google App Engine アプリの開発メモ / 言語: python, javascript / ビギナー

コードの構文解析ツールを作ってみた

この世にコードの構文解析ツールなんぞたくさんあるけど、JavaScriptやら正規表現やらの勉強がてら作ってみた。
ソースコードをいくつかの正規表現に掛けて、

  • 該当する部分
  • 該当しない部分

で振り分けしていく方法をとってみた。
これで意外とうまくいっちゃうのでびっくり。

(function() {

  /**
   * Scans `text` with `re` and reports every non-empty slice of it, in source
   * order, to one of two callbacks: `match` for text the pattern matched and
   * `unmatch` for the text lying between (and after) the matches.
   *
   * Both callbacks are invoked as `callback.call(object, object, slice)`, so
   * `object` is available both as `this` and as the first argument.
   *
   * @param {RegExp}   re      Pattern to scan with. Its lastIndex is reset
   *                           before scanning. Without the g flag only the
   *                           first match is reported.
   * @param {string}   text    Text to split.
   * @param {Object}   object  Receiver and first argument of both callbacks.
   * @param {Function} match   Receives each non-empty matched slice.
   * @param {Function} unmatch Receives each non-empty unmatched slice.
   */
  function re_match(re, text, object, match, unmatch) {
    var index = 0; // offset just past the last reported match
    var matched;

    // exec() on a global pattern resumes from lastIndex; a stale value left
    // by an earlier, interrupted scan would silently skip the head of text.
    re.lastIndex = 0;

    while ((matched = re.exec(text)) !== null) {
      if (matched[0].length === 0) {
        // exec() never advances past an empty match by itself, so the loop
        // would spin forever; step over it manually and report nothing.
        if (!re.global)
          break;
        re.lastIndex = matched.index + 1;
        continue;
      }

      if (index < matched.index)
        unmatch.call(object, object, text.substring(index, matched.index));

      match.call(object, object, matched[0]);

      index = matched.index + matched[0].length;

      // Without the g flag exec() restarts from offset 0 every time and
      // would return this same match again indefinitely.
      if (!re.global)
        break;
    }

    // Same emptiness guard as inside the loop: when text ends on a match
    // (or is empty) there is no remainder to report.
    if (index < text.length)
      unmatch.call(object, object, text.substring(index, text.length));
  }

  /**
   * Splits `code` into classified pieces and hands each piece to `handler`.
   *
   * The code is cut progressively: comments and string literals first, then
   * whitespace, symbols, numerics and keywords; whatever is left is a plain
   * word. Every handler callback is invoked with the tokenizer as both `this`
   * and first argument and the text slice as second argument.
   * NOTE(review): handlers are presumably expected to call push() to fill
   * `tokens`; nothing here enforces it.
   *
   * @constructor
   * @param {Object} language Object exposing the RegExp properties
   *        re_comments, re_strings, re_spaces, re_symbols, re_numerics and
   *        re_keywords (see CodeLanguage).
   * @param {string} code     Source text to tokenize.
   * @param {Object} handler  Object exposing token_comment, token_string,
   *        token_space, token_symbol, token_numeric, token_keyword and
   *        token_word callbacks.
   */
  function CodeTokenizer(language, code, handler) {
    this.language = language;
    this.handler = handler;
    this.tokens = [];
    this.index = 0; // read cursor used by next()
    this.split(code);
  }

  CodeTokenizer.prototype = {
    // Replacing the prototype object drops the default back-reference.
    constructor: CodeTokenizer,

    /**
     * Returns the token at the read cursor and advances the cursor, or null
     * once every stored token has been returned.
     */
    next: function() {
      if (this.index >= this.tokens.length)
        return null;

      return this.tokens[this.index ++];
    },

    /**
     * Appends `token` to the token list. Returns the tokenizer for chaining.
     */
    push: function(token) {
      this.tokens.push(token);
      return this;
    },

    /**
     * Entry point: separates comments and string literals from the rest of
     * `code` in a single left-to-right scan, then forwards the remaining
     * text to split_codes.
     *
     * Comments and strings are scanned together because extracting comments
     * in a separate first pass misreads `//` or `/*` inside a string literal
     * (for example "http://...") as the start of a comment.
     */
    split: function(code) {
      var handler = this.handler;
      var re_comments = this.language.re_comments;
      var re_strings = this.language.re_strings;

      // NOTE(review): the combined pattern takes its i/m flags from
      // re_comments and concatenates both sources; this assumes re_strings
      // uses the same flags and that neither pattern relies on numbered
      // backreferences - confirm for languages other than javascript.
      var flags = (re_comments.ignoreCase ? 'i' : '') +
                  (re_comments.multiline ? 'm' : '');
      var re_literals = new RegExp(
        '(?:' + re_comments.source + ')|(?:' + re_strings.source + ')',
        'g' + flags
      );
      var re_comment_head = new RegExp('^(?:' + re_comments.source + ')', flags);

      re_match(
        re_literals,
        code,
        this,
        function(tokenizer, text) {
          // A slice is a comment when the comment pattern matches at its
          // very first character; otherwise the string alternative matched.
          var head = re_comment_head.exec(text);
          var is_comment = head !== null && head.index === 0;

          (is_comment ? handler.token_comment : handler.token_string)
            .call(this, tokenizer, text);
        },
        this.split_codes
      );
    },

    /**
     * Separates string literals from `code` and forwards the rest to
     * split_codes. No longer used by split(); kept so existing callers of
     * this method keep working.
     */
    split_bodys: function(tokenizer, code) {
      re_match(
        this.language.re_strings,
        code,
        this,
        this.handler.token_string,
        this.split_codes
      );
    },

    /** Separates whitespace runs; the rest goes to split_tokens. */
    split_codes: function(tokenizer, code) {
      re_match(
        this.language.re_spaces,
        code,
        this,
        this.handler.token_space,
        this.split_tokens
      );
    },

    /** Separates symbol runs; the rest goes to split_words. */
    split_tokens: function(tokenizer, code) {
      re_match(
        this.language.re_symbols,
        code,
        this,
        this.handler.token_symbol,
        this.split_words
      );
    },

    /** Separates numeric literals; the rest goes to split_keywords. */
    split_words: function(tokenizer, code) {
      re_match(
        this.language.re_numerics,
        code,
        this,
        this.handler.token_numeric,
        this.split_keywords
      );
    },

    /** Separates keywords; anything left is reported as a plain word. */
    split_keywords: function(tokenizer, code) {
      re_match(
        this.language.re_keywords,
        code,
        this,
        this.handler.token_keyword,
        this.handler.token_word
      );
    }
  };

  // Publish the constructor on the global object explicitly: a bare
  // assignment to an undeclared name throws under strict mode / ES modules.
  // `this` is the legacy fallback (global object in a sloppy-mode IIFE).
  var tokenizer_root = typeof globalThis !== 'undefined' ? globalThis : this;
  tokenizer_root.CodeTokenizer = CodeTokenizer;

  // Publish the registry on the global object explicitly: a bare assignment
  // to an undeclared name throws under strict mode / ES modules.
  // `this` is the legacy fallback (global object in a sloppy-mode IIFE).
  var language_root = typeof globalThis !== 'undefined' ? globalThis : this;

  /**
   * Registry of language definitions. Each entry is a factory returning a
   * fresh object of global (g-flagged) RegExp instances, one per token class.
   */
  language_root.CodeLanguage = {
    /**
     * Builds the JavaScript definition.
     *
     * @returns {Object} Object with re_comments, re_strings, re_spaces,
     *          re_symbols, re_numerics and re_keywords.
     */
    javascript: function() {
      var language = {};

      // Block comments use [\s\S] so they can span any line terminator,
      // including the \r of CRLF files, which "." does not match.
      language.re_comments = new RegExp([
        '/\\*[\\s\\S]*?\\*/',
        '//.*'
      ].join('|'), 'g');

      // Single-line string literals. "\\." consumes a backslash escape as a
      // unit so an escaped quote does not end the literal early.
      language.re_strings = new RegExp([
        '"(?:\\\\.|[^"\\\\\\r\\n])*"',
        "'(?:\\\\.|[^'\\\\\\r\\n])*'"
      ].join('|'), 'g');

      language.re_spaces = new RegExp('\\s+', 'g');

      // Runs of operator / punctuation characters. No trailing comma in the
      // list: legacy engines turn it into an extra empty alternative.
      language.re_symbols = new RegExp('(?:' + [
        '!',
        '=',
        '\\(',
        '\\)',
        '\\{',
        '\\}',
        '<',
        '>',
        '\\[',
        '\\]',
        '\\+',
        '-',
        '\\*',
        '/',
        '%',
        '\\^',
        '~',
        '\\.',
        ',',
        ';',
        '\\?',
        ':',
        '&',
        '\\|'
      ].join('|') + ')+', 'g');

      // Anchored at both ends: only a chunk that is a digit run in its
      // entirety counts as a numeric.
      language.re_numerics = new RegExp('^[0-9]+$', 'g');

      // Anchored at both ends: only a chunk that is exactly a reserved word
      // counts as a keyword.
      language.re_keywords = new RegExp('^(' + [
        'abstract',
        'boolean',
        'break',
        'byte',
        'case',
        'catch',
        'char',
        'class',
        'const',
        'continue',
        'debugger',
        'default',
        'delete',
        'do',
        'double',
        'else',
        'enum',
        'export',
        'extends',
        'false',
        'final',
        'finally',
        'float',
        'for',
        'function',
        'goto',
        'if',
        'implements',
        'import',
        'in',
        'instanceof',
        'int',
        'interface',
        'let',
        'long',
        'native',
        'new',
        'null',
        'package',
        'private',
        'protected',
        'public',
        'return',
        'short',
        'static',
        'super',
        'switch',
        'synchronized',
        'this',
        'throw',
        'throws',
        'transient',
        'true',
        'try',
        'typeof',
        'var',
        'void',
        'volatile',
        'while',
        'with',
        'yield'
      ].join('|') + ')$', 'g');

      return language;
    }
  };
})();