Skip to content

Commit 5967d62

Browse files
committed
start implementing tokenizer & tokenize
1 parent b08176a commit 5967d62

File tree

3 files changed

+191
-38
lines changed

3 files changed

+191
-38
lines changed

include/ada/url_pattern.h

Lines changed: 131 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,43 +11,6 @@
1111

1212
namespace ada {
1313

14-
namespace url_pattern {
15-
16-
enum class errors { type_error };
17-
18-
// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
19-
std::optional<std::string> canonicalize_protocol(std::string_view input);
20-
21-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
22-
std::optional<std::string> canonicalize_username(std::string_view input);
23-
24-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
25-
std::optional<std::string> canonicalize_password(std::string_view input);
26-
27-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
28-
std::optional<std::string> canonicalize_hostname(std::string_view input);
29-
30-
// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
31-
std::optional<std::string> canonicalize_ipv6_hostname(std::string_view input);
32-
33-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
34-
std::optional<std::string> canonicalize_port(
35-
std::string_view input, std::string_view protocol = "fake");
36-
37-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
38-
std::optional<std::string> canonicalize_pathname(std::string_view input);
39-
40-
// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
41-
std::optional<std::string> canonicalize_opaque_pathname(std::string_view input);
42-
43-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
44-
std::optional<std::string> canonicalize_search(std::string_view input);
45-
46-
// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
47-
std::optional<std::string> canonicalize_hash(std::string_view input);
48-
49-
} // namespace url_pattern
50-
5114
// URLPattern is a Web Platform standard API for matching URLs against a
5215
// pattern syntax (think of it as a regular expression for URLs). It is
5316
// defined in https://wicg.github.io/urlpattern.
@@ -175,6 +138,137 @@ class URLPattern {
175138
bool ignore_case_ = false;
176139
};
177140

141+
namespace url_pattern {
142+
143+
enum class errors { type_error };
144+
145+
// @see https://urlpattern.spec.whatwg.org/#tokens
146+
// Token-related enumerations used by the URL pattern tokenizer. The struct
// currently declares no data members; it only scopes the Policy and Type
// enumerations.
struct Token {
  // The policy a tokenizer is created with.
  // @see https://urlpattern.spec.whatwg.org/#tokenize-policy
  enum Policy {
    STRICT,
    LENIENT,
  };

  // The kind of a token. The trailing numbers are the implicit enumerator
  // values.
  // @see https://urlpattern.spec.whatwg.org/#token
  enum Type {
    INVALID_CHAR,    // 0
    OPEN,            // 1
    CLOSE,           // 2
    REGEXP,          // 3
    NAME,            // 4
    CHAR,            // 5
    ESCAPED_CHAR,    // 6
    OTHER_MODIFIER,  // 7
    ASTERISK,        // 8
    END,             // 9
  };
};

// Mutable state carried while tokenizing a pattern string.
// @see https://urlpattern.spec.whatwg.org/#tokenizer
struct Tokenizer {
  // Creates a tokenizer over a copy of `input` using the given `policy`.
  // All other fields keep their in-class defaults (empty token list, both
  // indices at 0, null code point).
  //
  // The constructor is defined inline: a member-initializer list is only
  // valid on a definition, so it needs a (here empty) body. `policy` is a
  // plain enum, so it is copied directly rather than moved.
  explicit Tokenizer(std::string_view input, Token::Policy policy)
      : input(input), policy(policy) {}

  // has an associated input, a pattern string, initially the empty string.
  std::string input{};
  // has an associated policy, a tokenize policy, initially "strict".
  Token::Policy policy = Token::Policy::STRICT;
  // has an associated token list, a token list, initially an empty list.
  std::vector<Token> token_list{};
  // has an associated index, a number, initially 0.
  size_t index = 0;
  // has an associated next index, a number, initially 0.
  size_t next_index = 0;
  // has an associated code point, a Unicode code point, initially null.
  // NOTE(review): a `char*` can only point at a single byte of `input`;
  // confirm this is the intended representation for a Unicode code point.
  char* code_point = nullptr;
};
186+
187+
// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser
188+
struct ConstructorStringParser {
189+
explicit ConstructorStringParser(std::string_view input,
190+
std::vector<Token>& token_list);
191+
192+
private:
193+
// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
194+
enum State {
195+
INIT,
196+
PROTOCOL,
197+
AUTHORITY,
198+
PASSWORD,
199+
HOSTNAME,
200+
PORT,
201+
PATHNAME,
202+
SEARCH,
203+
HASH,
204+
DONE,
205+
};
206+
// has an associated input, a string, which must be set upon creation.
207+
std::string input;
208+
// has an associated token list, a token list, which must be set upon
209+
// creation.
210+
std::vector<Token> token_list;
211+
// has an associated result, a URLPatternInit, initially set to a new
212+
// URLPatternInit.
213+
URLPattern::Init result{};
214+
// has an associated component start, a number, initially set to 0.
215+
size_t component_start = 0;
216+
// has an associated token index, a number, initially set to 0.
217+
size_t token_index = 0;
218+
// has an associated token increment, a number, initially set to 1.
219+
size_t token_increment = 1;
220+
// has an associated group depth, a number, initially set to 0.
221+
size_t group_depth = 0;
222+
// has an associated hostname IPv6 bracket depth, a number, initially set to
223+
// 0.
224+
size_t hostname_ipv6_bracket_depth = 0;
225+
// has an associated protocol matches a special scheme flag, a boolean,
226+
// initially set to false.
227+
bool protocol_matches_a_special_scheme_flag = false;
228+
// has an associated state, a string, initially set to "init". It must be one
229+
// of the following:
230+
State state = INIT;
231+
};
232+
233+
// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
234+
std::optional<std::string> canonicalize_protocol(std::string_view input);
235+
236+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
237+
std::optional<std::string> canonicalize_username(std::string_view input);
238+
239+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
240+
std::optional<std::string> canonicalize_password(std::string_view input);
241+
242+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-hostname
243+
std::optional<std::string> canonicalize_hostname(std::string_view input);
244+
245+
// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
246+
std::optional<std::string> canonicalize_ipv6_hostname(std::string_view input);
247+
248+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
249+
std::optional<std::string> canonicalize_port(
250+
std::string_view input, std::string_view protocol = "fake");
251+
252+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
253+
std::optional<std::string> canonicalize_pathname(std::string_view input);
254+
255+
// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
256+
std::optional<std::string> canonicalize_opaque_pathname(std::string_view input);
257+
258+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
259+
std::optional<std::string> canonicalize_search(std::string_view input);
260+
261+
// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
262+
std::optional<std::string> canonicalize_hash(std::string_view input);
263+
264+
// @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
265+
URLPattern::Init parse_constructor_string(std::string_view input);
266+
267+
// @see https://urlpattern.spec.whatwg.org/#tokenize
268+
std::string tokenize(std::string_view input, Token::Policy policy);
269+
270+
} // namespace url_pattern
271+
178272
} // namespace ada
179273

180274
#endif

src/parser.cpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -907,7 +907,44 @@ result_type parse_url_impl(std::string_view user_input,
907907
// Work-in-progress implementation of URLPattern construction.
//
// `input` is either a constructor string or an already-built
// URLPattern::Init. `base_url` may be null. `options` is not read yet.
//
// Returns `url_pattern::errors::type_error` when:
//   - `input` is a string, `base_url` is null and the parsed init has no
//     protocol;
//   - `input` is a URLPattern::Init and `base_url` is NOT null;
//   - in every other case too, for now, because the remaining steps of the
//     algorithm are not implemented yet (see the TODOs below).
tl::expected<ada::URLPattern, ada::url_pattern::errors> parse_url_pattern(
    std::variant<std::string_view, URLPattern::Init> input,
    const std::string_view* base_url, const ada::URLPattern::Options* options) {
  // Let init be null.
  URLPattern::Init init;

  // If input is a scalar value string then:
  if (std::holds_alternative<std::string_view>(input)) {
    // Set init to the result of running parse a constructor string given
    // input.
    init = url_pattern::parse_constructor_string(
        std::get<std::string_view>(input));

    // If baseURL is null and init["protocol"] does not exist, then throw a
    // TypeError.
    if (base_url == nullptr && !init.protocol.has_value()) {
      return tl::unexpected(url_pattern::errors::type_error);
    }

    // If baseURL is not null, set init["baseURL"] to baseURL.
    if (base_url != nullptr) {
      init.base_url = std::string(*base_url);
    }
  } else {
    // Assert: input is a URLPatternInit.
    ADA_ASSERT_TRUE(std::holds_alternative<URLPattern::Init>(input));

    // If baseURL is not null, then throw a TypeError. A base URL supplied
    // next to a dictionary input is the error case, so the check must be
    // `!= nullptr` (it was previously inverted).
    if (base_url != nullptr) {
      return tl::unexpected(url_pattern::errors::type_error);
    }

    // Optimization: Avoid copy by moving the input value.
    // Set init to input.
    init = std::move(std::get<URLPattern::Init>(input));
  }

  // Let processedInit be the result of process a URLPatternInit given init,
  // "pattern", null, null, null, null, null, null, null, and null.
  // TODO: Implement this

  // For each componentName of « "protocol", "username", "password",
  // "hostname", "port", "pathname", "search", "hash" »: if
  // processedInit[componentName] does not exist, then set
  // processedInit[componentName] to "*".
  // TODO: Implement this

  // Placeholder result until the remaining steps are implemented.
  return tl::unexpected(url_pattern::errors::type_error);
}
913950

src/url_pattern.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,28 @@ std::optional<std::string> canonicalize_hash(std::string_view input) {
192192
return std::string(hash.substr(1));
193193
}
194194

195+
// Stub for "parse a constructor string": `input` is not inspected yet and an
// empty-initialized URLPattern::Init is returned.
URLPattern::Init parse_constructor_string(std::string_view input) {
  // Spec step still to be written: let parser be a new constructor string
  // parser whose input is input and whose token list is the result of running
  // tokenize given input and "lenient".
  // TODO: Implement this
  URLPattern::Init empty_init{};
  return empty_init;
}
201+
202+
std::string tokenize(std::string_view input, Token::Policy policy) {
203+
// Let tokenizer be a new tokenizer.
204+
// Set tokenizer’s input to input.
205+
// Set tokenizer’s policy to policy.
206+
auto tokenizer = Tokenizer(input, policy);
207+
// While tokenizer’s index is less than tokenizer’s input's code point length:
208+
while (tokenizer.index < tokenizer.input.size()) {
209+
// Run seek and get the next code point given tokenizer and tokenizer’s
210+
// index.
211+
// TODO
212+
}
213+
// TODO: Implement this
214+
return "";
215+
}
216+
195217
} // namespace url_pattern
196218

197219
URLPattern::Component::Component(std::string_view pattern_,

0 commit comments

Comments
 (0)