22#include " ada/url_pattern_helpers-inl.h"
33
44#include < algorithm>
5+ #include < charconv>
56#include < optional>
7+ #include < ranges>
68#include < string>
79
10+ #include " ada/character_sets.h"
11+ #include " ada/helpers.h"
12+ #include " ada/scheme.h"
13+ #include " ada/unicode.h"
14+
815namespace ada ::url_pattern_helpers {
916
1017std::tuple<std::string, std::vector<std::string>>
@@ -224,15 +231,16 @@ tl::expected<std::string, errors> canonicalize_username(
224231 if (input.empty ()) [[unlikely]] {
225232 return " " ;
226233 }
227- // Let dummyURL be a new URL record .
228- auto url = ada::parse<url_aggregator>( " fake://dummy.test " , nullptr );
229- ADA_ASSERT_TRUE (url. has_value () );
230- // Set the username given dummyURL and value.
231- if (!url-> set_username ( input)) {
232- return tl::unexpected (errors::type_error );
234+ // Percent-encode the input using the userinfo percent-encode set .
235+ size_t idx = ada::unicode::percent_encode_index (
236+ input, character_sets::USERINFO_PERCENT_ENCODE );
237+ if (idx == input. size ()) {
238+ // No encoding needed, return input as-is
239+ return std::string (input );
233240 }
234- // Return dummyURL's username.
235- return std::string (url->get_username ());
241+ // Percent-encode from the first character that needs encoding
242+ return ada::unicode::percent_encode (
243+ input, character_sets::USERINFO_PERCENT_ENCODE, idx);
236244}
237245
238246tl::expected<std::string, errors> canonicalize_password (
@@ -241,16 +249,16 @@ tl::expected<std::string, errors> canonicalize_password(
241249 if (input.empty ()) [[unlikely]] {
242250 return " " ;
243251 }
244- // Let dummyURL be a new URL record.
245- // Set the password given dummyURL and value.
246- auto url = ada::parse<url_aggregator>(" fake://dummy.test" , nullptr );
247-
248- ADA_ASSERT_TRUE (url.has_value ());
249- if (!url->set_password (input)) {
250- return tl::unexpected (errors::type_error);
252+ // Percent-encode the input using the userinfo percent-encode set.
253+ size_t idx = ada::unicode::percent_encode_index (
254+ input, character_sets::USERINFO_PERCENT_ENCODE);
255+ if (idx == input.size ()) {
256+ // No encoding needed, return input as-is
257+ return std::string (input);
251258 }
252- // Return dummyURL's password.
253- return std::string (url->get_password ());
259+ // Percent-encode from the first character that needs encoding
260+ return ada::unicode::percent_encode (
261+ input, character_sets::USERINFO_PERCENT_ENCODE, idx);
254262}
255263
256264tl::expected<std::string, errors> canonicalize_hostname (
@@ -300,17 +308,41 @@ tl::expected<std::string, errors> canonicalize_port(
300308 if (port_value.empty ()) [[unlikely]] {
301309 return " " ;
302310 }
303- // Let dummyURL be a new URL record.
304- // If protocolValue was given, then set dummyURL's scheme to protocolValue.
305- // Let parseResult be the result of running basic URL parser given portValue
306- // with dummyURL as url and port state as state override.
307- auto url = ada::parse<url_aggregator>(" fake://dummy.test" , nullptr );
308- ADA_ASSERT_TRUE (url);
309- if (url->set_port (port_value)) {
310- // Return dummyURL's port, serialized, or empty string if it is null.
311- return std::string (url->get_port ());
311+
312+ // Remove ASCII tab or newline characters
313+ std::string trimmed (port_value);
314+ helpers::remove_ascii_tab_or_newline (trimmed);
315+
316+ if (trimmed.empty ()) {
317+ return " " ;
312318 }
313- // If parseResult is failure, then throw a TypeError.
319+
320+ // Input should start with a digit character
321+ if (!unicode::is_ascii_digit (trimmed.front ())) {
322+ return tl::unexpected (errors::type_error);
323+ }
324+
325+ // Find the first non-digit character
326+ auto first_non_digit =
327+ std::ranges::find_if_not (trimmed, unicode::is_ascii_digit);
328+ std::string_view digits_to_parse =
329+ std::string_view (trimmed.data (), first_non_digit - trimmed.begin ());
330+
331+ // Parse the port number
332+ uint16_t parsed_port{};
333+ auto result = std::from_chars (digits_to_parse.data (),
334+ digits_to_parse.data () + digits_to_parse.size (),
335+ parsed_port);
336+
337+ if (result.ec == std::errc::result_out_of_range) {
338+ return tl::unexpected (errors::type_error);
339+ }
340+
341+ if (result.ec == std::errc ()) {
342+ // Successfully parsed, return as string
343+ return std::to_string (parsed_port);
344+ }
345+
314346 return tl::unexpected (errors::type_error);
315347}
316348
@@ -321,34 +353,55 @@ tl::expected<std::string, errors> canonicalize_port_with_protocol(
321353 return " " ;
322354 }
323355
324- // TODO: Remove this
325- // We have an empty protocol because get_protocol() returns an empty string
326- // We should handle this in the caller rather than here.
356+ // Handle empty or trailing colon in protocol
327357 if (protocol.empty ()) {
328358 protocol = " fake" ;
329359 } else if (protocol.ends_with (" :" )) {
330360 protocol.remove_suffix (1 );
331361 }
332- // Let dummyURL be a new URL record.
333- // If protocolValue was given, then set dummyURL's scheme to protocolValue.
334- // Let parseResult be the result of running basic URL parser given portValue
335- // with dummyURL as url and port state as state override.
336- auto url = ada::parse<url_aggregator>(std::string (protocol) + " ://dummy.test" ,
337- nullptr );
338- // TODO: Remove has_port() check.
339- // This is actually a bug with url parser where set_port() returns true for
340- // "invalid80" port value.
341- if (url && url->set_port (port_value) && url->has_port ()) {
342- // Return dummyURL's port, serialized, or empty string if it is null.
343- return std::string (url->get_port ());
344- }
345- // TODO: Remove this once the previous has_port() check is removed.
346- if (url) {
347- if (scheme::is_special (protocol) && url->get_port ().empty ()) {
362+
363+ // Remove ASCII tab or newline characters
364+ std::string trimmed (port_value);
365+ helpers::remove_ascii_tab_or_newline (trimmed);
366+
367+ if (trimmed.empty ()) {
368+ return " " ;
369+ }
370+
371+ // Input should start with a digit character
372+ if (!unicode::is_ascii_digit (trimmed.front ())) {
373+ return tl::unexpected (errors::type_error);
374+ }
375+
376+ // Find the first non-digit character
377+ auto first_non_digit =
378+ std::ranges::find_if_not (trimmed, unicode::is_ascii_digit);
379+ std::string_view digits_to_parse =
380+ std::string_view (trimmed.data (), first_non_digit - trimmed.begin ());
381+
382+ // Parse the port number
383+ uint16_t parsed_port{};
384+ auto result = std::from_chars (digits_to_parse.data (),
385+ digits_to_parse.data () + digits_to_parse.size (),
386+ parsed_port);
387+
388+ if (result.ec == std::errc::result_out_of_range) {
389+ return tl::unexpected (errors::type_error);
390+ }
391+
392+ if (result.ec == std::errc ()) {
393+ // Check if this is the default port for the scheme
394+ uint16_t default_port = scheme::get_special_port (protocol);
395+
396+ // If it's the default port for a special scheme, return empty string
397+ if (default_port != 0 && default_port == parsed_port) {
348398 return " " ;
349399 }
400+
401+ // Successfully parsed, return as string
402+ return std::to_string (parsed_port);
350403 }
351- // If parseResult is failure, then throw a TypeError.
404+
352405 return tl::unexpected (errors::type_error);
353406}
354407
@@ -401,44 +454,55 @@ tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
401454 if (input.empty ()) [[unlikely]] {
402455 return " " ;
403456 }
404- // Let dummyURL be a new URL record.
405- // Set dummyURL's query to the empty string.
406- // Let parseResult be the result of running basic URL parser given value with
407- // dummyURL as url and query state as state override.
408- auto url = ada::parse<url_aggregator>(" fake://dummy.test" , nullptr );
409- ADA_ASSERT_TRUE (url.has_value ());
410- url->set_search (input);
411- if (url->has_search ()) {
412- const auto search = url->get_search ();
413- if (!search.empty ()) {
414- return std::string (search.substr (1 ));
415- }
457+ // Remove leading '?' if present
458+ std::string new_value;
459+ new_value = input[0 ] == ' ?' ? input.substr (1 ) : input;
460+ // Remove ASCII tab or newline characters
461+ helpers::remove_ascii_tab_or_newline (new_value);
462+
463+ if (new_value.empty ()) {
416464 return " " ;
417465 }
418- return tl::unexpected (errors::type_error);
466+
467+ // Percent-encode using QUERY_PERCENT_ENCODE (for non-special URLs)
468+ // Note: "fake://dummy.test" is not a special URL, so we use
469+ // QUERY_PERCENT_ENCODE
470+ size_t idx = ada::unicode::percent_encode_index (
471+ new_value, character_sets::QUERY_PERCENT_ENCODE);
472+ if (idx == new_value.size ()) {
473+ // No encoding needed
474+ return new_value;
475+ }
476+ // Percent-encode from the first character that needs encoding
477+ return ada::unicode::percent_encode (
478+ new_value, character_sets::QUERY_PERCENT_ENCODE, idx);
419479}
420480
421481tl::expected<std::string, errors> canonicalize_hash (std::string_view input) {
422482 // If value is the empty string, return value.
423483 if (input.empty ()) [[unlikely]] {
424484 return " " ;
425485 }
426- // Let dummyURL be a new URL record.
427- // Set dummyURL's fragment to the empty string.
428- // Let parseResult be the result of running basic URL parser given value with
429- // dummyURL as url and fragment state as state override.
430- auto url = ada::parse<url_aggregator>(" fake://dummy.test" , nullptr );
431- ADA_ASSERT_TRUE (url.has_value ());
432- url->set_hash (input);
433- // Return dummyURL's fragment.
434- if (url->has_hash ()) {
435- const auto hash = url->get_hash ();
436- if (!hash.empty ()) {
437- return std::string (hash.substr (1 ));
438- }
486+ // Remove leading '#' if present
487+ std::string new_value;
488+ new_value = input[0 ] == ' #' ? input.substr (1 ) : input;
489+ // Remove ASCII tab or newline characters
490+ helpers::remove_ascii_tab_or_newline (new_value);
491+
492+ if (new_value.empty ()) {
439493 return " " ;
440494 }
441- return tl::unexpected (errors::type_error);
495+
496+ // Percent-encode using FRAGMENT_PERCENT_ENCODE
497+ size_t idx = ada::unicode::percent_encode_index (
498+ new_value, character_sets::FRAGMENT_PERCENT_ENCODE);
499+ if (idx == new_value.size ()) {
500+ // No encoding needed
501+ return new_value;
502+ }
503+ // Percent-encode from the first character that needs encoding
504+ return ada::unicode::percent_encode (
505+ new_value, character_sets::FRAGMENT_PERCENT_ENCODE, idx);
442506}
443507
444508tl::expected<std::vector<token>, errors> tokenize (std::string_view input,
0 commit comments