
Commit 066f5e2

fix(backend/sdoc_source_code): let lark scan CR to NEWLINE, not to NODE_STRING_VALUE
strictdoc#2554 added dedent logic and #2555 added hash generation. The resulting hashes for the same input differed between Windows and Linux, even though CR LF had been normalized to LF. Further investigation revealed that the dedent logic did not work as expected on Windows, because CR LF got split in half: the CR ended up in the string token and the LF in the newline token, which in turn confused the dedent logic.

By trial it turned out that "/[^\n\r]+/x" (verbose regex mode, real newline characters in the grammar string) is not the same as "/[^\\n\\r]+/" (normal regex mode, escaped newline characters): the former wrongly lets lark scan a \r into NODE_STRING_VALUE. Hence, change to the latter pattern.
1 parent 2d2f4e9 commit 066f5e2
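
As a quick illustration of the token split this fix is after, here is a minimal, self-contained sketch using plain lark with its default settings. It is a stand-in, not strictdoc's actual MarkerLexer grammar; the rule and terminal names (start, VALUE, NEWLINE) are made up for the example. Writing the character class with escaped \n/\r in a regular Python string, as the fixed pattern does, keeps CR out of the value token, so CRLF input tokenizes the same way as LF input apart from the text of the NEWLINE token:

    from lark import Lark

    # Plain-lark illustration (not strictdoc's MarkerLexer grammar): the character
    # class is written with escaped \n/\r in a regular Python string, mirroring the
    # fixed NODE_STRING_VALUE pattern, so CR can never leak into the value token.
    GRAMMAR = (
        "start: (VALUE NEWLINE)*\n"
        "VALUE: /[^\\n\\r]+/\n"
        "NEWLINE: /\\r?\\n/\n"
    )

    parser = Lark(GRAMMAR)

    for text in ("value1\nvalue2\n", "value1\r\nvalue2\r\n"):
        print([(token.type, token.value) for token in parser.parse(text).children])

    # Expected with lark's defaults: CRLF stays whole inside the NEWLINE token:
    #   [('VALUE', 'value1'), ('NEWLINE', '\n'), ('VALUE', 'value2'), ('NEWLINE', '\n')]
    #   [('VALUE', 'value1'), ('NEWLINE', '\r\n'), ('VALUE', 'value2'), ('NEWLINE', '\r\n')]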

2 files changed: +42 -2 lines changed

strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class GrammarTemplate(Template):
 node_name: /##CUSTOM_TAGS/
 node_multiline_value: (_WS_INLINE? | (_WS_INLINE NODE_STRING_VALUE)) NEWLINE (NODE_STRING_VALUE NEWLINE)*
 
-NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\n\r]+/x
+NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\\n\\r]+/
 
 _NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS):\\s)|(##RESERVED_KEYWORDS)).+/
 """)

tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py

Lines changed: 41 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 from typing import Any, List, Optional
 
-from lark import Tree
+from lark import Token, Tree
 
 from strictdoc.backend.sdoc_source_code.comment_parser.marker_lexer import (
     MarkerLexer,
@@ -489,6 +489,46 @@ def test_34_node_text_starting_below() -> None:
     )
 
 
+def test_35a_node_value_newline_lf() -> None:
+    """Verify that LF goes into a separate NEWLINE token."""
+    input_string = "FIELD: value1\nvalue2\n"
+    tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"})
+
+    node_fields = list(tree.find_data("node_field"))
+
+    assert_node_field(
+        node_fields[0],
+        "FIELD",
+        [
+            Token("NODE_STRING_VALUE", "value1"),
+            Token("NEWLINE", "\n"),
+            Token("NODE_STRING_VALUE", "value2"),
+            Token("NEWLINE", "\n"),
+        ],
+    )
+
+
+def test_35b_node_value_newline_crlf() -> None:
+    """Verify that CR LF goes into a separate NEWLINE token."""
+    input_string = "FIELD: value1\r\nvalue2\r\n"
+    tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"})
+
+    node_fields = list(tree.find_data("node_field"))
+
+    assert_node_field(
+        node_fields[0],
+        "FIELD",
+        [
+            Token("NODE_STRING_VALUE", "value1"),
+            Token("NEWLINE", "\r\n"),
+            Token("NODE_STRING_VALUE", "value2"),
+            # The implicit \r\n => \n conversion at EOF is not nice, but doesn't hurt (yet).
+            # We need to improve EOF handling in lark grammar to get rid of it.
+            Token("NEWLINE", "\n"),
+        ],
+    )
+
+
 def test_60_exclude_reserved_keywords() -> None:
     input_string = """
 FIXME: This can likely replace _weak below with no problem.
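
For a quick local check of the new behaviour, one option (assuming pytest is available in the environment) is to run only the test_35 cases from this module:

    # Select only the new test_35a / test_35b cases from this test module.
    import pytest

    pytest.main([
        "tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py",
        "-k", "test_35",
    ])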
