1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
url / url_canon_non_special_url.cc [blame]
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Functions to canonicalize non-special URLs.
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
namespace url {
namespace {
// Canonicalizes a non-special URL that does not have an opaque path.
//
// The components located by `parsed` are read from `source`, their canonical
// form is appended to `output`, and the resulting component positions are
// written to `new_parsed`.
//
// Returns false when the scheme, user info, host, port or path step reports
// failure, or when authority components are present without a valid host.
// The query and ref steps do not contribute to the return value.
//
// The overall shape follows `DoCanonicalizeStandardURL()`, but the two differ
// in many small details, which is why non-special URLs have their own
// function.
//
// url::ReplaceComponents() also ends up here, so replacements that would
// produce an invalid URL have to be tolerated, for example:
//
//   > const url = "git:///";
//   > url.username = "x";
//   > url.href
//   "git:///" (and not "git://x@").
template <typename CHAR>
bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
                                 const Parsed& parsed,
                                 CharsetConverter* query_converter,
                                 CanonOutput& output,
                                 Parsed& new_parsed) {
  // Opaque-path URLs must not be routed to this function.
  DCHECK(!parsed.has_opaque_path);

  // Scheme comes first; the scheme canonicalizer appends the colon.
  bool ok = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
                               &new_parsed.scheme);

  // A URL has an authority as soon as any one authority component is valid.
  //
  // Cases that need particular care:
  //
  // | URL      | parsed.user   | parsed.host   | has_authority | Valid URL? |
  // |----------+---------------+---------------+---------------+------------|
  // | git:/a   | invalid       | invalid       | false         | valid      |
  // | git://@/ | valid (empty) | invalid       | true          | invalid    |
  // | git:///  | invalid       | valid (empty) | true          | valid      |
  const bool has_authority =
      parsed.username.is_valid() || parsed.password.is_valid() ||
      parsed.host.is_valid() || parsed.port.is_valid();

  if (!has_authority) {
    // Nothing to emit: mark every authority component as absent.
    new_parsed.host.reset();
    new_parsed.username.reset();
    new_parsed.password.reset();
    new_parsed.port.reset();
  } else {
    // The "//" separator is only emitted when there is a scheme.
    if (parsed.scheme.is_valid()) {
      output.push_back('/');
      output.push_back('/');
    }

    // Username and password are dropped unless the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-username
    // - https://url.spec.whatwg.org/#dom-url-password
    if (!parsed.host.is_nonempty()) {
      new_parsed.username.reset();
      new_parsed.password.reset();
    } else {
      // The user info canonicalizer takes care of the ':' and '@'.
      ok &= CanonicalizeUserInfo(
          source.username, parsed.username, source.password, parsed.password,
          &output, &new_parsed.username, &new_parsed.password);
    }

    // Host.
    if (!parsed.host.is_valid()) {
      // Authority components without a host make the URL invalid, e.g.
      // "git://@/".
      new_parsed.host.reset();
      ok = false;
    } else {
      ok &= CanonicalizeNonSpecialHost(source.host, parsed.host, output,
                                       new_parsed.host);
    }

    // Port is dropped unless the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-port
    if (!parsed.host.is_nonempty()) {
      new_parsed.port.reset();
    } else {
      ok &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
                             &output, &new_parsed.port);
    }
  }

  // Path.
  if (!parsed.path.is_valid()) {
    new_parsed.path.reset();
  } else if (!parsed.host.is_valid() && parsed.path.is_empty()) {
    // Edge case: the pathname of a path-only non-special URL was replaced by
    // an empty string. Such a URL cannot lose its path, so "/" is written
    // instead.
    //
    //   > const url = new URL("git:/a");
    //   > url.pathname = '';
    //   > url.href
    //   => "git:/" rather than "git:".
    //   > url.pathname
    //   => "/" rather than "".
    //
    // The governing text is https://url.spec.whatwg.org/#dom-url-pathname; it
    // takes a careful read of the URL Standard to see why the pathname ends
    // up as "/" here.
    new_parsed.path.begin = output.length();
    output.push_back('/');
    new_parsed.path.len = output.length() - new_parsed.path.begin;
  } else {
    ok &= CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
                           &output, &new_parsed.path);

    // Without a host, a canonical path that begins with "//" would read back
    // as an authority.
    const bool path_mimics_authority =
        !parsed.host.is_valid() && new_parsed.path.is_valid() &&
        new_parsed.path.as_string_view_on(output.view().data())
            .starts_with("//");
    if (path_mimics_authority) {
      // Put "/." in front of the path so that it is not taken for a host.
      //
      //   > const url = new URL("git:/.//a");
      //   > url.href
      //   => "git:/.//a" rather than "git://a".
      //
      //   > const url = new URL("git:/");
      //   > url.pathname = "/.//a"
      //   > url.href
      //   => "git:/.//a" rather than "git://a".
      //
      // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer
      //
      // > 3. If url's host is null, url does not have an opaque path, url's
      // >    path's size is greater than 1, and url's path[0] is the empty
      // >    string, then append U+002F (/) followed by U+002E (.) to output.
      //
      // The path length is not known before canonicalizing it, so the fix-up
      // is applied afterwards. This should be rare enough that the cost of
      // the insertion is negligible.
      const size_t length_before_insert = output.length();
      output.Insert(new_parsed.path.begin, "/.");
      // Move the start of the path past the inserted characters.
      new_parsed.path.begin += output.length() - length_before_insert;
    }
  }

  // Query.
  CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
                    &new_parsed.query);

  // Ref: failures are deliberately ignored because the page can probably
  // still be loaded.
  CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);

  // Propagate the potentially-dangling-markup flag when it is set.
  if (parsed.potentially_dangling_markup) {
    new_parsed.potentially_dangling_markup = true;
  }

  return ok;
}
} // namespace
// Canonicalizes the non-special URL held in the `char` string `spec`, whose
// components are located by `parsed`. Output is appended to `output` and the
// new component positions are stored in `new_parsed`.
//
// URLs flagged as having an opaque path are delegated to
// CanonicalizePathURL(); every other URL goes through
// DoCanonicalizeNonSpecialURL(). `spec_len` is forwarded only on the
// opaque-path route.
bool CanonicalizeNonSpecialURL(const char* spec,
                               int spec_len,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  // The opaque-path flag is copied across before either canonicalizer runs.
  new_parsed.has_opaque_path = parsed.has_opaque_path;

  if (!parsed.has_opaque_path) {
    return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
                                       query_converter, output, new_parsed);
  }
  return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
}
// Canonicalizes the non-special URL held in the `char16_t` string `spec`,
// whose components are located by `parsed`. Output is appended to `output`
// and the new component positions are stored in `new_parsed`.
//
// URLs flagged as having an opaque path are delegated to
// CanonicalizePathURL(); every other URL goes through
// DoCanonicalizeNonSpecialURL(). `spec_len` is forwarded only on the
// opaque-path route.
bool CanonicalizeNonSpecialURL(const char16_t* spec,
                               int spec_len,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  // The opaque-path flag is copied across before either canonicalizer runs.
  new_parsed.has_opaque_path = parsed.has_opaque_path;

  if (!parsed.has_opaque_path) {
    return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
                                       query_converter, output, new_parsed);
  }
  return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
}
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
// Carry over the flag.
new_parsed.has_opaque_path = base_parsed.has_opaque_path;
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
&new_parsed);
}
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupOverrideComponents(base, replacements, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
new_parsed);
}
// For 16-bit replacements, we turn all the replacements into UTF-8 so the
// regular code path can be used.
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char16_t>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
// Carry over the flag.
new_parsed.has_opaque_path = base_parsed.has_opaque_path;
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
&new_parsed);
}
RawCanonOutput<1024> utf8;
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
new_parsed);
}
} // namespace url