
Commit 066f5e2

fix(backend/sdoc_source_code): let lark scan CR to NEWLINE, not to NODE_STRING_VALUE
strictdoc#2554 added dedent logic and #2555 added hash generation. The resulting hashes for the same input differed between Windows and Linux, even though CR LF had been normalized to LF. Further investigation revealed that the dedent logic did not work as expected on Windows, because CR LF got split in half: the CR ended up in the string token and the LF in the newline token, which in turn confused the dedent logic.

By trial it turned out that "/[^\n\r]+/x" (verbose regex mode, real newline characters in the grammar string) is not the same as "/[^\\n\\r]+/" (normal regex mode, escaped newline characters): the former wrongly lets lark scan a \r into NODE_STRING_VALUE. Hence, change to the latter pattern.
1 parent 2d2f4e9 commit 066f5e2
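
As a quick illustration of the token split this fix is after, here is a minimal, self-contained sketch using plain lark with its default settings. It is a stand-in, not strictdoc's actual MarkerLexer grammar; the rule and terminal names (start, VALUE, NEWLINE) are made up for the example. Writing the character class with escaped \n/\r in a regular Python string, as the fixed pattern does, keeps CR out of the value token, so CRLF input tokenizes the same way as LF input apart from the text of the NEWLINE token:

    from lark import Lark

    # Plain-lark illustration (not strictdoc's MarkerLexer grammar): the character
    # class is written with escaped \n/\r in a regular Python string, mirroring the
    # fixed NODE_STRING_VALUE pattern, so CR can never leak into the value token.
    GRAMMAR = (
        "start: (VALUE NEWLINE)*\n"
        "VALUE: /[^\\n\\r]+/\n"
        "NEWLINE: /\\r?\\n/\n"
    )

    parser = Lark(GRAMMAR)

    for text in ("value1\nvalue2\n", "value1\r\nvalue2\r\n"):
        print([(token.type, token.value) for token in parser.parse(text).children])

    # Expected with lark's defaults: CRLF stays whole inside the NEWLINE token:
    #   [('VALUE', 'value1'), ('NEWLINE', '\n'), ('VALUE', 'value2'), ('NEWLINE', '\n')]
    #   [('VALUE', 'value1'), ('NEWLINE', '\r\n'), ('VALUE', 'value2'), ('NEWLINE', '\r\n')]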

2 files changed: +42 -2 lines changed

strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class GrammarTemplate(Template):
 node_name: /##CUSTOM_TAGS/
 node_multiline_value: (_WS_INLINE? | (_WS_INLINE NODE_STRING_VALUE)) NEWLINE (NODE_STRING_VALUE NEWLINE)*
 
-NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\n\r]+/x
+NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*(##CUSTOM_TAGS):\\s)(?!\\s*##NODE_FIELD_END_MARKER)[^\\n\\r]+/
 
 _NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS):\\s)|(##RESERVED_KEYWORDS)).+/
 """)

tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py

Lines changed: 41 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 from typing import Any, List, Optional
 
-from lark import Tree
+from lark import Token, Tree
 
 from strictdoc.backend.sdoc_source_code.comment_parser.marker_lexer import (
     MarkerLexer,
@@ -489,6 +489,46 @@ def test_34_node_text_starting_below() -> None:
     )
 
 
+def test_35a_node_value_newline_lf() -> None:
+    """Verify that LF goes into a separate NEWLINE token."""
+    input_string = "FIELD: value1\nvalue2\n"
+    tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"})
+
+    node_fields = list(tree.find_data("node_field"))
+
+    assert_node_field(
+        node_fields[0],
+        "FIELD",
+        [
+            Token("NODE_STRING_VALUE", "value1"),
+            Token("NEWLINE", "\n"),
+            Token("NODE_STRING_VALUE", "value2"),
+            Token("NEWLINE", "\n"),
+        ],
+    )
+
+
+def test_35b_node_value_newline_crlf() -> None:
+    """Verify that CR LF goes into a separate NEWLINE token."""
+    input_string = "FIELD: value1\r\nvalue2\r\n"
+    tree = MarkerLexer.parse(input_string, custom_tags={"FIELD"})
+
+    node_fields = list(tree.find_data("node_field"))
+
+    assert_node_field(
+        node_fields[0],
+        "FIELD",
+        [
+            Token("NODE_STRING_VALUE", "value1"),
+            Token("NEWLINE", "\r\n"),
+            Token("NODE_STRING_VALUE", "value2"),
+            # The implicit \r\n => \n conversion at EOF is not nice, but doesn't hurt (yet).
+            # We need to improve EOF handling in lark grammar to get rid of it.
+            Token("NEWLINE", "\n"),
+        ],
+    )
+
+
 def test_60_exclude_reserved_keywords() -> None:
     input_string = """
 FIXME: This can likely replace _weak below with no problem.
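
For a quick local check of the new behaviour, one option (assuming pytest is available in the environment) is to run only the test_35 cases from this module:

    # Select only the new test_35a / test_35b cases from this test module.
    import pytest

    pytest.main([
        "tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py",
        "-k", "test_35",
    ])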
