1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201

url / url_parse_file.cc [blame]

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <string_view>

#include "base/check.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_file.h"
#include "url/url_parse_internal.h"

// Interesting IE file:isms...
//
//  INPUT                      OUTPUT
//  =========================  ==============================
//  file:/foo/bar              file:///foo/bar
//      The result here seems totally invalid!?!? This isn't UNC.
//
//  file:/
//  file:// or any other number of slashes
//      IE6 doesn't do anything at all if you click on this link. No error:
//      nothing. IE6's history system seems to always color this link, so I'm
//      guessing that it maps internally to the empty URL.
//
//  C:\                        file:///C:/
//      When on a file: URL source page, this link will work. When over HTTP,
//      the file: URL will appear in the status bar but the link will not work
//      (security restriction for all file URLs).
//
//  file:foo/                  file:foo/     (invalid?!?!?)
//  file:/foo/                 file:///foo/  (invalid?!?!?)
//  file://foo/                file://foo/   (UNC to server "foo")
//  file:///foo/               file:///foo/  (invalid, seems to be a file)
//  file:////foo/              file://foo/   (UNC to server "foo")
//      Any more than four slashes is also treated as UNC.
//
//  file:C:/                   file://C:/
//  file:/C:/                  file://C:/
//      The number of slashes after "file:" don't matter if the thing following
//      it looks like an absolute drive path. Also, slashes and backslashes are
//      equally valid here.

namespace url {

namespace {

// Returns the index of the next slash in the input after the given index, or
// `spec.size()` if the end of the input is reached.
template <typename CharT>
size_t FindNextSlash(std::basic_string_view<CharT> spec, size_t begin_index) {
  size_t idx = begin_index;
  while (idx < spec.size() && !IsSlashOrBackslash(spec[idx])) {
    idx++;
  }
  return idx;
}

// A subcomponent of DoParseFileURL, the input of this function should be a UNC
// path name, with the index of the first character after the slashes following
// the scheme given in `after_slashes`. This will initialize the host, path,
// query, and ref, and leave the other output components untouched
// (DoParseFileURL handles these for us).
template <typename CharT>
void DoParseUNC(std::basic_string_view<CharT> url,
                size_t after_slashes,
                Parsed* parsed) {
  int url_len = base::checked_cast<int>(url.size());
  // The cast is safe because `FindNextSlash` will never return anything longer
  // than `url_len`.
  int next_slash = static_cast<int>(FindNextSlash(url, after_slashes));

  // Everything up until that first slash we found (or end of string) is the
  // host name, which will end up being the UNC host. For example,
  // "file://foo/bar.txt" will get a server name of "foo" and a path of "/bar".
  // Later, on Windows, this should be treated as the filename "\\foo\bar.txt"
  // in proper UNC notation.
  if (after_slashes < static_cast<size_t>(next_slash)) {
    parsed->host = MakeRange(after_slashes, next_slash);
  } else {
    parsed->host.reset();
  }
  if (next_slash < url_len) {
    ParsePathInternal(url.data(), MakeRange(next_slash, url_len), &parsed->path,
                      &parsed->query, &parsed->ref);
  } else {
    parsed->path.reset();
  }
}

// A subcomponent of DoParseFileURL, the input should be a local file, with the
// beginning of the path indicated by the index in `path_begin`. This will
// initialize the host, path, query, and ref, and leave the other output
// components untouched (DoParseFileURL handles these for us).
template <typename CharT>
void DoParseLocalFile(std::basic_string_view<CharT> url,
                      int path_begin,
                      Parsed* parsed) {
  parsed->host.reset();
  ParsePathInternal(url.data(),
                    MakeRange(path_begin, base::checked_cast<int>(url.size())),
                    &parsed->path, &parsed->query, &parsed->ref);
}

// Backend for the external functions that operates on either char type.
// Handles cases where there is a scheme, but also when handed the first
// character following the "file:" at the beginning of the spec. If so,
// this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
template <typename CharT>
Parsed DoParseFileURL(std::basic_string_view<CharT> url) {
  // Strip leading & trailing spaces and control characters.
  int begin = 0;
  int url_len = base::checked_cast<int>(url.size());
  TrimURL(url.data(), &begin, &url_len);

  // Find the scheme, if any.
  int num_slashes = CountConsecutiveSlashes(url.data(), begin, url_len);
  int after_scheme;
  size_t after_slashes;
  Parsed parsed;
#ifdef WIN32
  // See how many slashes there are. We want to handle cases like UNC but also
  // "/c:/foo". This is when there is no scheme, so we can allow pages to do
  // links like "c:/foo/bar" or "//foo/bar". This is also called by the
  // relative URL resolver when it determines there is an absolute URL, which
  // may give us input like "/c:/foo".
  after_slashes = begin + num_slashes;
  if (DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len)) {
    // Windows path, don't try to extract the scheme (for example, "c:\foo").
    after_scheme = after_slashes;
  } else if (DoesBeginUNCPath(url.data(), begin, url_len, false)) {
    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
    after_scheme = begin;
  } else
#endif
  {
    // ExtractScheme doesn't understand the possibility of filenames with
    // colons in them, in which case it returns the entire spec up to the
    // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
    // the foo.c: scheme.
    if (!num_slashes &&
        ExtractScheme(&url[begin], url_len - begin, &parsed.scheme)) {
      // Offset the results since we gave ExtractScheme a substring.
      parsed.scheme.begin += begin;
      after_scheme = parsed.scheme.end() + 1;
    } else {
      // No scheme found, remember that.
      parsed.scheme.reset();
      after_scheme = begin;
    }
  }

  // Handle empty specs ones that contain only whitespace or control chars,
  // or that are just the scheme (for example "file:").
  if (after_scheme == url_len) {
    return parsed;
  }

  num_slashes = CountConsecutiveSlashes(url.data(), after_scheme, url_len);
  after_slashes = after_scheme + num_slashes;
#ifdef WIN32
  // Check whether the input is a drive again. We checked above for windows
  // drive specs, but that's only at the very beginning to see if we have a
  // scheme at all. This test will be duplicated in that case, but will
  // additionally handle all cases with a real scheme such as "file:///C:/".
  if (!DoesBeginWindowsDriveSpec(url.data(), after_slashes, url_len) &&
      num_slashes != 3) {
    // Anything not beginning with a drive spec ("c:\") on Windows is treated
    // as UNC, with the exception of three slashes which always means a file.
    // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
    DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
    return parsed;
  }
#else
  // file: URL with exactly 2 slashes is considered to have a host component.
  if (num_slashes == 2) {
    DoParseUNC(url.substr(0, url_len), after_slashes, &parsed);
    return parsed;
  }
#endif  // WIN32

  // Easy and common case, the full path immediately follows the scheme
  // (modulo slashes), as in "file://c:/foo". Just treat everything from
  // there to the end as the path. Empty hosts have 0 length instead of -1.
  // We include the last slash as part of the path if there is one.
  DoParseLocalFile(
      url.substr(0, url_len),
      num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme, &parsed);
  return parsed;
}

}  // namespace

Parsed ParseFileURL(std::string_view url) {
  return DoParseFileURL(url);
}

Parsed ParseFileURL(std::u16string_view url) {
  return DoParseFileURL(url);
}

}  // namespace url