content / test / data / tree_parser_util.js [blame]

// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/**
 * Parser for a simple grammar that describes a tree structure using a function-
 * like "a(b(c,d))" syntax. Original intended usage: to have browsertests
 * specify an arbitrary tree of iframes, loaded from various sites, without
 * having to write a .html page for each level or do crazy feats of data: url
 * escaping. But there's nothing really iframe-specific here. See below for some
 * examples of the grammar and the parser output.
 *
 * @example <caption>Basic syntax: an identifier, optionally followed by a list
 * of attributes, optionally followed by a list of children.</caption>
 * // returns { value: 'abc', attributes: [], children: [] }
 * TreeParserUtil.parse('abc {} ()');
 *
 * @example <caption>Both the attribute and child lists are optional. Dots and
 * hyphens are legal in ids.</caption>
 * // returns { value: 'example-b.com', attributes: [], children: [] }
 * TreeParserUtil.parse('example-b.com');
 *
 * @example <caption>Attributes are identifiers as well, separated by commas.
 * </caption>
 * // returns { value: 'abc', attributes: ['attr-1', 'attr-2'], children: [] }
 * TreeParserUtil.parse('abc {attr-1, attr-2}');
 *
 * @example <caption>Commas separate children in the child list.</caption>
 * // returns { value: 'b', attributes: [], children: [
 * //           { value: 'c', attributes: [], children: [] },
 * //           { value: 'd', attributes: [], children: [] }
 * //         ]}
 * TreeParserUtil.parse('b (c, d)';
 *
 * @example <caption>Children can have children, and so on.</caption>
 * // returns { value: 'e', attributes: [], children: [
 * //           { value: 'f', attributes: [], children: [
 * //             { value: 'g', attributes: [], children: [
 * //               { value: 'h', attributes: [], children: [] },
 * //               { value: 'i', attributes: [], children: [
 * //                 { value: 'j', attributes: [], children: [] }
 * //               ]},
 * //             ]}
 * //           ]}
 * //         ]}
 * TreeParserUtil.parse('e(f(g(h(),i(j))))';
 *
 * @example <caption>Attributes can be applied to children at any level of
 * nesting.</caption>
 * // returns { value: 'b', attributes: ['red', 'blue'], children: [
 * //           { value: 'c', attributes: [], children: [] },
 * //           { value: 'd', attributes: ['green'], children: [] }
 * //         ]}
 * TreeParserUtil.parse('b{red,blue}(c,d{green})';
 *
 * @example <caption>flatten() converts a [sub]tree back to a string.</caption>
 * var tree = TreeParserUtil.parse('b.com (c.com(e.com), d.com)');
 * TreeParserUtil.flatten(tree.children[0]);  // returns 'c.com(e.com())'
 */
var TreeParserUtil = (function() {
  'use strict';

  /**
   * Parses an input string into a tree. See class comment for examples.
   * @returns A tree of the form {value: <string>, children: <Array.<tree>>}.
   */
  function parse(input) {
    var tokenStream = lex(input);

    var result = takeIdAndChild(tokenStream);
    if (tokenStream.length != 0)
      throw new Error('Expected end of stream, but found "' +
                      tokenStream[0] + '".')
    return result;
  }

  /**
   * Inverse of |parse|. Converts a parsed tree object into a string. Can be
   * used to forward a subtree as an argument to a nested document.
   */
  function flatten(tree) {
    var result = tree.value;
    if (tree.attributes && tree.attributes.length)
      result += '{' + tree.attributes.join(",") + "}";
    return result + '(' + tree.children.map(flatten).join(',') + ')';
  }

  /**
   * Lexer function to convert an input string into a token stream.  Splits the
   * input along whitespace, parens and commas. Whitespace is removed, while
   * parens and commas are preserved as standalone tokens.
   *
   * @param {string} input The input string.
   * @return {Array.<string>} The resulting token stream.
   */
  function lex(input) {
    return input.split(/(\s+|\(|\)|{|}|,)/).reduce(
      function (resultArray, token) {
        var trimmed = token.trim();
        if (trimmed) {
          resultArray.push(trimmed);
        }
        return resultArray;
      }, []);
  }

  /**
   * Consumes from the stream an identifier with optional attribute and child
   * lists, returning its parsed representation.
   */
  function takeIdAndChild(tokenStream) {
    return { value: takeIdentifier(tokenStream),
             attributes: takeAttributeList(tokenStream),
             children: takeChildList(tokenStream) };
  }

  /**
   * Consumes from the stream an identifier, returning its value (as a string).
   */
  function takeIdentifier(tokenStream) {
    if (tokenStream.length == 0)
      throw new Error('Expected an identifier, but found end-of-stream.');
    var token = tokenStream.shift();
    // This regex includes slash, question mark, ampersand, and period in order
    // to accommodate full URLs, which may be passed to
    // cross_site_iframe_factory.html as leaf node locations. Colon is for port
    // numbers, which may also be specified. Semicolon and equals are for cookie
    // strings, which may sometimes be provided in URL query strings such as for
    // the EmbeddedTestServer default handler /set-cookie. Percent is for
    // allowing escaped squences in the URL, like %2F.
    const re = new RegExp('^[a-zA-Z0-9\\/%_?&;:.=-]+$');
    if (!token.match(re))
      throw new Error('Expected an identifier, but found "' + token + '".');
    return token;
  }

  /**
   * Consumes an optional attribute list from the token stream, returning a list
   * of the parsed attribute identifiers.
   */
  function takeAttributeList(tokenStream) {
    // Remove the next token from the stream if it matches |token|.
    function tryToEatA(token) {
      if (tokenStream[0] == token) {
        tokenStream.shift();
        return true;
      }
      return false;
    }

    // Bare identifier case, as in 'b' in the input '(a (b, c))'
    if (!tryToEatA('{'))
      return [];

    // Empty list case, as in 'b' in the input 'a (b {}, c)'.
    if (tryToEatA('}')) {
      return [];
    }

    // List with at least one entry.
    var result = [ takeIdentifier(tokenStream) ];

    // Additional entries allowed with comma.
    while (tryToEatA(',')) {
      result.push(takeIdentifier(tokenStream));
    }

    // End of list.
    if (tryToEatA('}')) {
      return result;
    }
    if (tokenStream.length == 0)
      throw new Error('Expected "}" or ",", but found end-of-stream.');
    throw new Error('Expected "}" or ",", but found "' + tokenStream[0] + '".');
  }

  /**
   * Consumes an optional child list from the token stream, returning a list of
   * the parsed children.
   */
  function takeChildList(tokenStream) {
    // Remove the next token from the stream if it matches |token|.
    function tryToEatA(token) {
      if (tokenStream[0] == token) {
        tokenStream.shift();
        return true;
      }
      return false;
    }

    // Bare identifier case, as in 'b' in the input '(a (b, c))'
    if (!tryToEatA('('))
      return [];

    // Empty list case, as in 'b' in the input 'a (b (), c)'.
    if (tryToEatA(')')) {
      return [];
    }

    // List with at least one entry.
    var result = [ takeIdAndChild(tokenStream) ];

    // Additional entries allowed with commas.
    while (tryToEatA(',')) {
      result.push(takeIdAndChild(tokenStream));
    }

    // End of list.
    if (tryToEatA(')')) {
      return result;
    }
    if (tokenStream.length == 0)
      throw new Error('Expected ")" or ",", but found end-of-stream.');
    throw new Error('Expected ")" or ",", but found "' + tokenStream[0] + '".');
  }

  return {
    parse:   parse,
    flatten: flatten
  };
})();

document.scriptExecuted = true;