Skip to content

Commit 792e6ed

Browse files
committed
optimize canonicalize methods for url_pattern
1 parent 31b1f28 commit 792e6ed

File tree

1 file changed

+138
-74
lines changed

1 file changed

+138
-74
lines changed

src/url_pattern_helpers.cpp

Lines changed: 138 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,16 @@
22
#include "ada/url_pattern_helpers-inl.h"
33

44
#include <algorithm>
5+
#include <charconv>
56
#include <optional>
7+
#include <ranges>
68
#include <string>
79

10+
#include "ada/character_sets.h"
11+
#include "ada/helpers.h"
12+
#include "ada/scheme.h"
13+
#include "ada/unicode.h"
14+
815
namespace ada::url_pattern_helpers {
916

1017
std::tuple<std::string, std::vector<std::string>>
@@ -224,15 +231,16 @@ tl::expected<std::string, errors> canonicalize_username(
224231
if (input.empty()) [[unlikely]] {
225232
return "";
226233
}
227-
// Let dummyURL be a new URL record.
228-
auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
229-
ADA_ASSERT_TRUE(url.has_value());
230-
// Set the username given dummyURL and value.
231-
if (!url->set_username(input)) {
232-
return tl::unexpected(errors::type_error);
234+
// Percent-encode the input using the userinfo percent-encode set.
235+
size_t idx = ada::unicode::percent_encode_index(
236+
input, character_sets::USERINFO_PERCENT_ENCODE);
237+
if (idx == input.size()) {
238+
// No encoding needed, return input as-is
239+
return std::string(input);
233240
}
234-
// Return dummyURL's username.
235-
return std::string(url->get_username());
241+
// Percent-encode from the first character that needs encoding
242+
return ada::unicode::percent_encode(
243+
input, character_sets::USERINFO_PERCENT_ENCODE, idx);
236244
}
237245

238246
tl::expected<std::string, errors> canonicalize_password(
@@ -241,16 +249,16 @@ tl::expected<std::string, errors> canonicalize_password(
241249
if (input.empty()) [[unlikely]] {
242250
return "";
243251
}
244-
// Let dummyURL be a new URL record.
245-
// Set the password given dummyURL and value.
246-
auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
247-
248-
ADA_ASSERT_TRUE(url.has_value());
249-
if (!url->set_password(input)) {
250-
return tl::unexpected(errors::type_error);
252+
// Percent-encode the input using the userinfo percent-encode set.
253+
size_t idx = ada::unicode::percent_encode_index(
254+
input, character_sets::USERINFO_PERCENT_ENCODE);
255+
if (idx == input.size()) {
256+
// No encoding needed, return input as-is
257+
return std::string(input);
251258
}
252-
// Return dummyURL's password.
253-
return std::string(url->get_password());
259+
// Percent-encode from the first character that needs encoding
260+
return ada::unicode::percent_encode(
261+
input, character_sets::USERINFO_PERCENT_ENCODE, idx);
254262
}
255263

256264
tl::expected<std::string, errors> canonicalize_hostname(
@@ -300,17 +308,41 @@ tl::expected<std::string, errors> canonicalize_port(
300308
if (port_value.empty()) [[unlikely]] {
301309
return "";
302310
}
303-
// Let dummyURL be a new URL record.
304-
// If protocolValue was given, then set dummyURL's scheme to protocolValue.
305-
// Let parseResult be the result of running basic URL parser given portValue
306-
// with dummyURL as url and port state as state override.
307-
auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
308-
ADA_ASSERT_TRUE(url);
309-
if (url->set_port(port_value)) {
310-
// Return dummyURL's port, serialized, or empty string if it is null.
311-
return std::string(url->get_port());
311+
312+
// Remove ASCII tab or newline characters
313+
std::string trimmed(port_value);
314+
helpers::remove_ascii_tab_or_newline(trimmed);
315+
316+
if (trimmed.empty()) {
317+
return "";
312318
}
313-
// If parseResult is failure, then throw a TypeError.
319+
320+
// Input should start with a digit character
321+
if (!unicode::is_ascii_digit(trimmed.front())) {
322+
return tl::unexpected(errors::type_error);
323+
}
324+
325+
// Find the first non-digit character
326+
auto first_non_digit =
327+
std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
328+
std::string_view digits_to_parse =
329+
std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
330+
331+
// Parse the port number
332+
uint16_t parsed_port{};
333+
auto result = std::from_chars(digits_to_parse.data(),
334+
digits_to_parse.data() + digits_to_parse.size(),
335+
parsed_port);
336+
337+
if (result.ec == std::errc::result_out_of_range) {
338+
return tl::unexpected(errors::type_error);
339+
}
340+
341+
if (result.ec == std::errc()) {
342+
// Successfully parsed, return as string
343+
return std::to_string(parsed_port);
344+
}
345+
314346
return tl::unexpected(errors::type_error);
315347
}
316348

@@ -321,34 +353,55 @@ tl::expected<std::string, errors> canonicalize_port_with_protocol(
321353
return "";
322354
}
323355

324-
// TODO: Remove this
325-
// We have an empty protocol because get_protocol() returns an empty string
326-
// We should handle this in the caller rather than here.
356+
// Substitute the placeholder scheme "fake" for an empty protocol; otherwise strip a trailing ':' if present.
327357
if (protocol.empty()) {
328358
protocol = "fake";
329359
} else if (protocol.ends_with(":")) {
330360
protocol.remove_suffix(1);
331361
}
332-
// Let dummyURL be a new URL record.
333-
// If protocolValue was given, then set dummyURL's scheme to protocolValue.
334-
// Let parseResult be the result of running basic URL parser given portValue
335-
// with dummyURL as url and port state as state override.
336-
auto url = ada::parse<url_aggregator>(std::string(protocol) + "://dummy.test",
337-
nullptr);
338-
// TODO: Remove has_port() check.
339-
// This is actually a bug with url parser where set_port() returns true for
340-
// "invalid80" port value.
341-
if (url && url->set_port(port_value) && url->has_port()) {
342-
// Return dummyURL's port, serialized, or empty string if it is null.
343-
return std::string(url->get_port());
344-
}
345-
// TODO: Remove this once the previous has_port() check is removed.
346-
if (url) {
347-
if (scheme::is_special(protocol) && url->get_port().empty()) {
362+
363+
// Remove ASCII tab or newline characters
364+
std::string trimmed(port_value);
365+
helpers::remove_ascii_tab_or_newline(trimmed);
366+
367+
if (trimmed.empty()) {
368+
return "";
369+
}
370+
371+
// Input should start with a digit character
372+
if (!unicode::is_ascii_digit(trimmed.front())) {
373+
return tl::unexpected(errors::type_error);
374+
}
375+
376+
// Find the first non-digit character
377+
auto first_non_digit =
378+
std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
379+
std::string_view digits_to_parse =
380+
std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
381+
382+
// Parse the port number
383+
uint16_t parsed_port{};
384+
auto result = std::from_chars(digits_to_parse.data(),
385+
digits_to_parse.data() + digits_to_parse.size(),
386+
parsed_port);
387+
388+
if (result.ec == std::errc::result_out_of_range) {
389+
return tl::unexpected(errors::type_error);
390+
}
391+
392+
if (result.ec == std::errc()) {
393+
// Check if this is the default port for the scheme
394+
uint16_t default_port = scheme::get_special_port(protocol);
395+
396+
// If it's the default port for a special scheme, return empty string
397+
if (default_port != 0 && default_port == parsed_port) {
348398
return "";
349399
}
400+
401+
// Successfully parsed, return as string
402+
return std::to_string(parsed_port);
350403
}
351-
// If parseResult is failure, then throw a TypeError.
404+
352405
return tl::unexpected(errors::type_error);
353406
}
354407

@@ -401,44 +454,55 @@ tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
401454
if (input.empty()) [[unlikely]] {
402455
return "";
403456
}
404-
// Let dummyURL be a new URL record.
405-
// Set dummyURL's query to the empty string.
406-
// Let parseResult be the result of running basic URL parser given value with
407-
// dummyURL as url and query state as state override.
408-
auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
409-
ADA_ASSERT_TRUE(url.has_value());
410-
url->set_search(input);
411-
if (url->has_search()) {
412-
const auto search = url->get_search();
413-
if (!search.empty()) {
414-
return std::string(search.substr(1));
415-
}
457+
// Remove leading '?' if present
458+
std::string new_value;
459+
new_value = input[0] == '?' ? input.substr(1) : input;
460+
// Remove ASCII tab or newline characters
461+
helpers::remove_ascii_tab_or_newline(new_value);
462+
463+
if (new_value.empty()) {
416464
return "";
417465
}
418-
return tl::unexpected(errors::type_error);
466+
467+
// Percent-encode using QUERY_PERCENT_ENCODE (for non-special URLs)
468+
// Note: the code this replaces parsed against "fake://dummy.test", which is
469+
// not a special URL, so the non-special QUERY_PERCENT_ENCODE set applies here.
470+
size_t idx = ada::unicode::percent_encode_index(
471+
new_value, character_sets::QUERY_PERCENT_ENCODE);
472+
if (idx == new_value.size()) {
473+
// No encoding needed
474+
return new_value;
475+
}
476+
// Percent-encode from the first character that needs encoding
477+
return ada::unicode::percent_encode(
478+
new_value, character_sets::QUERY_PERCENT_ENCODE, idx);
419479
}
420480

421481
tl::expected<std::string, errors> canonicalize_hash(std::string_view input) {
422482
// If value is the empty string, return value.
423483
if (input.empty()) [[unlikely]] {
424484
return "";
425485
}
426-
// Let dummyURL be a new URL record.
427-
// Set dummyURL's fragment to the empty string.
428-
// Let parseResult be the result of running basic URL parser given value with
429-
// dummyURL as url and fragment state as state override.
430-
auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
431-
ADA_ASSERT_TRUE(url.has_value());
432-
url->set_hash(input);
433-
// Return dummyURL's fragment.
434-
if (url->has_hash()) {
435-
const auto hash = url->get_hash();
436-
if (!hash.empty()) {
437-
return std::string(hash.substr(1));
438-
}
486+
// Remove leading '#' if present
487+
std::string new_value;
488+
new_value = input[0] == '#' ? input.substr(1) : input;
489+
// Remove ASCII tab or newline characters
490+
helpers::remove_ascii_tab_or_newline(new_value);
491+
492+
if (new_value.empty()) {
439493
return "";
440494
}
441-
return tl::unexpected(errors::type_error);
495+
496+
// Percent-encode using FRAGMENT_PERCENT_ENCODE
497+
size_t idx = ada::unicode::percent_encode_index(
498+
new_value, character_sets::FRAGMENT_PERCENT_ENCODE);
499+
if (idx == new_value.size()) {
500+
// No encoding needed
501+
return new_value;
502+
}
503+
// Percent-encode from the first character that needs encoding
504+
return ada::unicode::percent_encode(
505+
new_value, character_sets::FRAGMENT_PERCENT_ENCODE, idx);
442506
}
443507

444508
tl::expected<std::vector<token>, errors> tokenize(std::string_view input,

0 commit comments

Comments
 (0)