1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
  227
  228
  229
  230
  231
  232
  233
  234
  235
  236
  237
  238
  239
  240
  241
  242
  243
  244
  245
  246
  247
  248
  249
  250
  251
  252
  253
  254
  255
  256
  257
  258
  259
  260
  261
  262
  263
  264
  265
  266
  267
  268
  269
  270
  271
  272
  273
  274
  275
  276
  277
  278
  279
  280
  281
  282
  283
  284
  285
  286
  287
  288
  289
  290
  291
  292
  293
  294
  295
  296
  297
  298
  299
  300
  301
  302
  303
  304
  305
  306
  307
  308
  309
  310
  311
  312
  313
  314
  315
  316
  317
  318
  319
  320
  321
  322
  323
  324
  325
  326
  327
  328
  329
  330
  331
  332
  333
  334
  335
  336
  337
  338
  339
  340
  341
  342
  343
  344
  345
  346
  347
  348
  349
  350
  351
  352
  353
  354
  355
  356
  357
  358
  359
  360
  361
  362
  363
  364
  365
  366
  367
  368
  369
  370
  371
  372
  373
  374
  375
  376
  377
  378
  379
  380
  381
  382
  383
  384
  385
  386
  387
  388
  389
  390
  391
  392
  393
  394
  395
  396
  397
  398
  399
  400
  401
  402
  403
  404
  405
  406
  407
  408
  409
  410
  411
  412
  413
  414
  415
  416
  417
  418
  419
  420
  421
  422
  423
  424
  425
  426
  427
  428
  429
  430
  431
  432
  433
  434
  435
  436
  437
  438
  439
  440
  441
  442
  443
  444
  445
  446
  447
  448
  449
  450
  451
  452
  453
  454
  455
  456
  457
  458
  459
  460
  461
  462
  463
  464
  465
  466
  467
  468
  469
  470
  471

base / i18n / build_utf8_validator_tables.cc [blame]

// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
#pragma allow_unsafe_buffers
#endif

// Create a state machine for validating UTF-8. The algorithm in brief:
// 1. Convert the complete unicode range of code points, except for the
//    surrogate code points, to an ordered array of sequences of bytes in
//    UTF-8.
// 2. Convert individual bytes to ranges, starting from the right of each byte
//    sequence. For each range, ensure the bytes on the left and the ranges
//    on the right are the identical.
// 3. Convert the resulting list of ranges into a state machine, collapsing
//    identical states.
// 4. Convert the state machine to an array of bytes.
// 5. Output as a C++ file.
//
// To use:
//  $ ninja -C out/Release build_utf8_validator_tables
//  $ out/Release/build_utf8_validator_tables
//                                   --output=base/i18n/utf8_validator_tables.cc
//  $ git add base/i18n/utf8_validator_tables.cc
//
// Because the table is not expected to ever change, it is checked into the
// repository rather than being regenerated at build time.
//
// This code uses type uint8_t throughout to represent bytes, to avoid
// signed/unsigned char confusion.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>

#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/memory/raw_ptr.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "third_party/icu/source/common/unicode/utf8.h"

namespace {

const char kHelpText[] =
    "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";

const char kProlog[] =
    "// Copyright 2013 The Chromium Authors\n"
    "// Use of this source code is governed by a BSD-style license that can "
    "be\n"
    "// found in the LICENSE file.\n"
    "\n"
    "// This file is auto-generated by build_utf8_validator_tables.\n"
    "// DO NOT EDIT.\n"
    "\n"
    "#include \"base/i18n/utf8_validator_tables.h\"\n"
    "\n"
    "namespace base {\n"
    "namespace internal {\n"
    "\n"
    "const uint8_t kUtf8ValidatorTables[] = {\n";

const char kEpilog[] =
    "};\n"
    "\n"
    "const size_t kUtf8ValidatorTablesSize = "
    "std::size(kUtf8ValidatorTables);\n"
    "\n"
    "}  // namespace internal\n"
    "}  // namespace base\n";

// Ranges are inclusive at both ends--they represent [from, to]
class Range {
 public:
  // Ranges always start with just one byte.
  explicit Range(uint8_t value) : from_(value), to_(value) {}

  // Range objects are copyable and assignable to be used in STL
  // containers. Since they only contain non-pointer POD types, the default copy
  // constructor, assignment operator and destructor will work.

  // Add a byte to the range. We intentionally only support adding a byte at the
  // end, since that is the only operation the code needs.
  void AddByte(uint8_t to) {
    CHECK(to == to_ + 1);
    to_ = to;
  }

  uint8_t from() const { return from_; }
  uint8_t to() const { return to_; }

  bool operator<(const Range& rhs) const {
    return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
  }

  bool operator==(const Range& rhs) const {
    return from() == rhs.from() && to() == rhs.to();
  }

 private:
  uint8_t from_;
  uint8_t to_;
};

// A vector of Ranges is like a simple regular expression--it corresponds to
// a set of strings of the same length that have bytes in each position in
// the appropriate range.
typedef std::vector<Range> StringSet;

// A UTF-8 "character" is represented by a sequence of bytes.
typedef std::vector<uint8_t> Character;

// In the second stage of the algorithm, we want to convert a large list of
// Characters into a small list of StringSets.
struct Pair {
  Character character;
  StringSet set;
};

typedef std::vector<Pair> PairVector;

// A class to print a table of numbers in the same style as clang-format.
class TablePrinter {
 public:
  explicit TablePrinter(FILE* stream)
      : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

  TablePrinter(const TablePrinter&) = delete;
  TablePrinter& operator=(const TablePrinter&) = delete;

  void PrintValue(uint8_t value) {
    if (values_on_this_line_ == 0) {
      fputs("   ", stream_);
    } else if (values_on_this_line_ == kMaxValuesPerLine) {
      fprintf(stream_.get(), "  // 0x%02x\n   ", current_offset_);
      values_on_this_line_ = 0;
    }
    fprintf(stream_.get(), " 0x%02x,", static_cast<int>(value));
    ++values_on_this_line_;
    ++current_offset_;
  }

  void NewLine() {
    while (values_on_this_line_ < kMaxValuesPerLine) {
      fputs("      ", stream_);
      ++values_on_this_line_;
    }
    fprintf(stream_.get(), "  // 0x%02x\n", current_offset_);
    values_on_this_line_ = 0;
  }

 private:
  // stdio stream. Not owned.
  raw_ptr<FILE> stream_;

  // Number of values so far printed on this line.
  int values_on_this_line_;

  // Total values printed so far.
  int current_offset_;

  static const int kMaxValuesPerLine = 8;
};

// Start by filling a PairVector with characters. The resulting vector goes from
// "\x00" to "\xf4\x8f\xbf\xbf".
PairVector InitializeCharacters() {
  PairVector vector;
  for (int i = 0; i <= 0x10FFFF; ++i) {
    if (i >= 0xD800 && i < 0xE000) {
      // Surrogate codepoints are not permitted. Non-character code points are
      // explicitly permitted.
      continue;
    }
    uint8_t bytes[4];
    unsigned int offset = 0;
    UBool is_error = false;
    U8_APPEND(bytes, offset, std::size(bytes), i, is_error);
    DCHECK(!is_error);
    DCHECK_GT(offset, 0u);
    DCHECK_LE(offset, std::size(bytes));
    Pair pair = {Character(bytes, bytes + offset), StringSet()};
    vector.push_back(pair);
  }
  return vector;
}

// Construct a new Pair from |character| and the concatenation of |new_range|
// and |existing_set|, and append it to |pairs|.
void ConstructPairAndAppend(const Character& character,
                            const Range& new_range,
                            const StringSet& existing_set,
                            PairVector* pairs) {
  Pair new_pair = {character, StringSet(1, new_range)};
  new_pair.set.insert(
      new_pair.set.end(), existing_set.begin(), existing_set.end());
  pairs->push_back(new_pair);
}

// Each pass over the PairVector strips one byte off the right-hand-side of the
// characters and adds a range to the set on the right. For example, the first
// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
// [\xa0-\xbf][\x80-\xbf]).
void MoveRightMostCharToSet(PairVector* pairs) {
  PairVector new_pairs;
  PairVector::const_iterator it = pairs->begin();
  while (it != pairs->end() && it->character.empty()) {
    new_pairs.push_back(*it);
    ++it;
  }
  CHECK(it != pairs->end());
  Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
  Range new_range(it->character.back());
  StringSet converted = it->set;
  ++it;
  while (it != pairs->end()) {
    const Pair& current_pair = *it++;
    if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
        std::equal(unconverted_bytes.begin(),
                   unconverted_bytes.end(),
                   current_pair.character.begin()) &&
        converted == current_pair.set) {
      // The particular set of UTF-8 codepoints we are validating guarantees
      // that each byte range will be contiguous. This would not necessarily be
      // true for an arbitrary set of UTF-8 codepoints.
      DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
      new_range.AddByte(current_pair.character.back());
      continue;
    }
    ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
    unconverted_bytes = Character(current_pair.character.begin(),
                                  current_pair.character.end() - 1);
    new_range = Range(current_pair.character.back());
    converted = current_pair.set;
  }
  ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
  new_pairs.swap(*pairs);
}

void MoveAllCharsToSets(PairVector* pairs) {
  // Since each pass of the function moves one character, and UTF-8 sequences
  // are at most 4 characters long, this simply runs the algorithm four times.
  for (int i = 0; i < 4; ++i) {
    MoveRightMostCharToSet(pairs);
  }
#if DCHECK_IS_ON()
  for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
       ++it) {
    DCHECK(it->character.empty());
  }
#endif
}

// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
// algorithm is working. Use the command-line option
// --vmodule=build_utf8_validator_tables=1 to see this output.
void LogStringSets(const PairVector& pairs) {
  for (const auto& pair_it : pairs) {
    std::string set_as_string;
    for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
         ++set_it) {
      set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
                                          static_cast<int>(set_it->from()),
                                          static_cast<int>(set_it->to()));
    }
    VLOG(1) << set_as_string;
  }
}

// A single state in the state machine is represented by a sorted vector of
// start bytes and target states. All input bytes in the range between the start
// byte and the next entry in the vector (or 0xFF) result in a transition to the
// target state.
struct StateRange {
  uint8_t from;
  uint8_t target_state;
};

typedef std::vector<StateRange> State;

// Generates a state where all bytes go to state 1 (invalid). This is also used
// as an initialiser for other states (since bytes from outside the desired
// range are invalid).
State GenerateInvalidState() {
  const StateRange range = {0, 1};
  return State(1, range);
}

// A map from a state (ie. a set of strings which will match from this state) to
// a number (which is an index into the array of states).
typedef std::map<StringSet, uint8_t> StateMap;

// Create a new state corresponding to |set|, add it |states| and |state_map|
// and return the index it was given in |states|.
uint8_t MakeState(const StringSet& set,
                  std::vector<State>* states,
                  StateMap* state_map) {
  DCHECK(!set.empty());
  const Range& range = set.front();
  const StringSet rest(set.begin() + 1, set.end());
  const StateMap::const_iterator where = state_map->find(rest);
  const uint8_t target_state = where == state_map->end()
                                   ? MakeState(rest, states, state_map)
                                   : where->second;
  DCHECK_LT(0, range.from());
  DCHECK_LT(range.to(), 0xFF);
  const StateRange new_state_initializer[] = {
      {0, 1},
      {range.from(), target_state},
      {static_cast<uint8_t>(range.to() + 1), 1}};
  states->push_back(
      State(new_state_initializer,
            new_state_initializer + std::size(new_state_initializer)));
  const uint8_t new_state_number =
      base::checked_cast<uint8_t>(states->size() - 1);
  CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
  return new_state_number;
}

std::vector<State> GenerateStates(const PairVector& pairs) {
  // States 0 and 1 are the initial/valid state and invalid state, respectively.
  std::vector<State> states(2, GenerateInvalidState());
  StateMap state_map;
  state_map.insert(std::make_pair(StringSet(), 0));
  for (auto it = pairs.begin(); it != pairs.end(); ++it) {
    DCHECK(it->character.empty());
    DCHECK(!it->set.empty());
    const Range& range = it->set.front();
    const StringSet rest(it->set.begin() + 1, it->set.end());
    const StateMap::const_iterator where = state_map.find(rest);
    const uint8_t target_state = where == state_map.end()
                                     ? MakeState(rest, &states, &state_map)
                                     : where->second;
    if (states[0].back().from == range.from()) {
      DCHECK_EQ(1, states[0].back().target_state);
      states[0].back().target_state = target_state;
      DCHECK_LT(range.to(), 0xFF);
      const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
      states[0].push_back(new_range);
    } else {
      DCHECK_LT(range.to(), 0xFF);
      const StateRange new_range_initializer[] = {
          {range.from(), target_state},
          {static_cast<uint8_t>(range.to() + 1), 1}};
      states[0].insert(
          states[0].end(), new_range_initializer,
          new_range_initializer + std::size(new_range_initializer));
    }
  }
  return states;
}

// Output the generated states as a C++ table. Two tricks are used to compact
// the table: each state in the table starts with a shift value which indicates
// how many bits we can discard from the right-hand-side of the byte before
// doing the table lookup. Secondly, only the state-transitions for bytes
// with the top-bit set are included in the table; bytes without the top-bit set
// are just ASCII and are handled directly by the code.
void PrintStates(const std::vector<State>& states, FILE* stream) {
  // First calculate the start-offset of each state. This allows the state
  // machine to jump directly to the correct offset, avoiding an extra
  // indirection. State 0 starts at offset 0.
  std::vector<uint8_t> state_offset(1, 0);
  std::vector<uint8_t> shifts;
  uint8_t pos = 0;

  for (const auto& state_it : states) {
    // We want to set |shift| to the (0-based) index of the least-significant
    // set bit in any of the ranges for this state, since this tells us how many
    // bits we can discard and still determine what range a byte lies in. Sadly
    // it appears that ffs() is not portable, so we do it clumsily.
    uint8_t shift = 7;
    for (auto range_it = state_it.begin(); range_it != state_it.end();
         ++range_it) {
      while (shift > 0 && range_it->from % (1 << shift) != 0) {
        --shift;
      }
    }
    shifts.push_back(shift);
    pos += 1 + (1 << (7 - shift));
    state_offset.push_back(pos);
  }

  DCHECK_EQ(129, state_offset[1]);

  fputs(kProlog, stream);
  TablePrinter table_printer(stream);

  for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
    const uint8_t shift = shifts[state_index];
    uint8_t next_range = 0;
    uint8_t target_state = 1;
    fprintf(stream,
            "    // State %d, offset 0x%02x\n",
            static_cast<int>(state_index),
            static_cast<int>(state_offset[state_index]));
    table_printer.PrintValue(shift);
    for (int i = 0; i < 0x100; i += (1 << shift)) {
      if (next_range < states[state_index].size() &&
          states[state_index][next_range].from == i) {
        target_state = states[state_index][next_range].target_state;
        ++next_range;
      }
      if (i >= 0x80) {
        table_printer.PrintValue(state_offset[target_state]);
      }
    }
    table_printer.NewLine();
  }

  fputs(kEpilog, stream);
}

}  // namespace

int main(int argc, char* argv[]) {
  base::CommandLine::Init(argc, argv);
  logging::LoggingSettings settings;
  settings.logging_dest =
      logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
  logging::InitLogging(settings);
  if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
    fwrite(kHelpText, 1, std::size(kHelpText), stdout);
    exit(EXIT_SUCCESS);
  }
  base::FilePath filename =
      base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

  FILE* output = stdout;
  if (!filename.empty()) {
    output = base::OpenFile(filename, "wb");
    if (!output)
      PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
                  << "' for writing";
  }

  // Step 1: Enumerate the characters
  PairVector pairs = InitializeCharacters();
  // Step 2: Convert to sets.
  MoveAllCharsToSets(&pairs);
  if (VLOG_IS_ON(1)) {
    LogStringSets(pairs);
  }
  // Step 3: Generate states.
  std::vector<State> states = GenerateStates(pairs);
  // Step 4/5: Print output
  PrintStates(states, output);

  if (!filename.empty()) {
    if (!base::CloseFile(output))
      PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
                  << "'";
  }

  return EXIT_SUCCESS;
}