fix: indentation issues in code formatter.

plusvic · plusvic · commit 6bfe2e2e0bea · 2025-09-15T13:28:01.000+02:00
This fixes multiple issues in the formatter when handling comments in code that uses tabs for indentation.

It introduces a new `--tab-size` argument to the `yr fmt` command that allows specifying the tab size used in the input code.
diff --git a/cli/src/commands/fmt.rs b/cli/src/commands/fmt.rs
@@ -7,7 +7,7 @@ use clap::{arg, value_parser, ArgAction, ArgMatches, Command};
 use yara_x_fmt::Formatter;
 
 use crate::config::Config;
-use crate::help::FMT_CHECK_MODE;
+use crate::help;
 
 pub fn fmt() -> Command {
     super::command("fmt")
@@ -19,14 +19,26 @@ pub fn fmt() -> Command {
                 .value_parser(value_parser!(PathBuf))
                 .action(ArgAction::Append),
         )
-        .arg(arg!(-c --check  "Run in 'check' mode").long_help(FMT_CHECK_MODE))
+        .arg(
+            arg!(-c --check  "Run in 'check' mode")
+                .long_help(help::FMT_CHECK_MODE),
+        )
+        .arg(
+            arg!(-t - -"tab-size" <NUM_SPACES>)
+                .help("Tab size (in spaces) used in source files")
+                .long_help(help::FMT_TAB_SIZE)
+                .default_value("4")
+                .value_parser(value_parser!(usize)),
+        )
 }
 
 pub fn exec_fmt(args: &ArgMatches, config: &Config) -> anyhow::Result<()> {
     let files = args.get_many::<PathBuf>("FILE").unwrap();
     let check = args.get_flag("check");
+    let tab_size = args.get_one::<usize>("tab-size").unwrap();
 
     let formatter = Formatter::new()
+        .input_tab_size(*tab_size)
         .align_metadata(config.fmt.meta.align_values)
         .align_patterns(config.fmt.patterns.align_values)
         .indent_section_headers(config.fmt.rule.indent_section_headers)
diff --git a/cli/src/help.rs b/cli/src/help.rs
@@ -86,6 +86,12 @@ pub const FMT_CHECK_MODE: &str = r#"Run in 'check' mode
 Doesn't modify the files. Exits with 0 if files are formatted correctly. Exits
 with 1 if formatting is required."#;
 
+pub const FMT_TAB_SIZE: &str = r#"Tab size (in spaces) used in source files
+
+If the input contains tab characters, the formatter uses this value to determine how
+many spaces each tab represents. Setting this incorrectly can lead to misaligned 
+formatting when the code mixes tabs and spaces."#;
+
 pub const FIX_ENCODING_LONG_HELP: &str = r#"Convert source files to UTF-8
 
 YARA-X is stricter that YARA with respect to invalid UTF-8 characters in source
diff --git a/fmt/src/comments.rs b/fmt/src/comments.rs
@@ -24,8 +24,9 @@ use crate::tokens::{Token, TokenStream};
 /// ```
 ///
 /// This processor must be used with a token stream that still retains the
-/// original spacing of the source code, because it needs the spacing for
-/// determining the original indentation of the comment. For example:
+/// original spacing of the source code (but with tabs replaced by spaces),
+/// because it needs the spacing for determining the original indentation
+/// of the comment. For example:
 ///
 /// ```text
 /// rule test {
@@ -73,6 +74,7 @@ where
     start_of_input: bool,
     end_of_input: bool,
     indentation: usize,
+    tab_size: usize,
 }
 
 /// States used in [`CommentProcessor::process_input_buffer`]
@@ -103,9 +105,18 @@ where
             start_of_input: true,
             end_of_input: false,
             indentation: 0,
+            tab_size: 4,
         }
     }
 
+    /// Number of spaces in a tab.
+    ///
+    /// The default is `4`.
+    pub fn tab_size(mut self, n: usize) -> Self {
+        self.tab_size = n;
+        self
+    }
+
     fn push_comment(
         &mut self,
         comment_lines: Vec<Vec<u8>>,
@@ -154,7 +165,11 @@ where
                 State::PreComment { leading_newline } => {
                     match self.input_buffer.pop_front() {
                         Some(token @ Token::Whitespace) => {
-                            self.indentation += token.len();
+                            self.indentation += 1;
+                            self.output_buffer.push_back(token);
+                        }
+                        Some(token @ Token::Tab) => {
+                            self.indentation += self.tab_size;
                             self.output_buffer.push_back(token);
                         }
                         // A newline has been found while in PreComment state,
@@ -174,6 +189,7 @@ where
                                 lines: split_comment_lines(
                                     comment,
                                     self.indentation,
+                                    self.tab_size,
                                 ),
                             };
                             self.indentation += token.len();
@@ -189,8 +205,11 @@ where
                     leading_newline,
                     indentation,
                 } => match self.input_buffer.pop_front() {
-                    Some(token @ Token::Whitespace) => {
-                        self.indentation += token.len();
+                    Some(Token::Whitespace) => {
+                        self.indentation += 1;
+                    }
+                    Some(Token::Tab) => {
+                        self.indentation += self.tab_size;
                     }
                     // Newline found while in the Comment state. If this is the
                     // first newline after the comment, the trailing_newline
@@ -241,8 +260,12 @@ where
                     Some(Token::Comment(comment)) => {
                         if *indentation == self.indentation {
                             lines.append(
-                                split_comment_lines(comment, *indentation)
-                                    .as_mut(),
+                                split_comment_lines(
+                                    comment,
+                                    *indentation,
+                                    self.tab_size,
+                                )
+                                .as_mut(),
                             );
                             *trailing_newline = false;
                         } else {
@@ -258,6 +281,7 @@ where
                                 lines: split_comment_lines(
                                     comment,
                                     self.indentation,
+                                    self.tab_size,
                                 ),
                             };
                         }
@@ -331,7 +355,7 @@ where
 /// Splits a multi-line comment into lines.
 ///
 /// Also removes the specified number of whitespaces from the beginning of
-/// each line, except the first one.
+/// each line.
 ///
 /// This is necessary because when a multi-line comment that uses the
 /// `/* comment */` syntax is indented, the comment itself contains some spaces
@@ -346,16 +370,31 @@ where
 /// Notice how the comment contains some spaces (here represented by
 /// `<-- indentation -->`) that should be removed/adjusted when the comment
 /// is re-indented.
-fn split_comment_lines(comment: &[u8], indentation: usize) -> Vec<Vec<u8>> {
+fn split_comment_lines(
+    comment: &[u8],
+    indentation: usize,
+    tab_size: usize,
+) -> Vec<Vec<u8>> {
     let comment = BStr::new(comment);
-    let indent = b" ".repeat(indentation);
     let mut result = Vec::new();
     for line in comment.lines() {
-        if let Some(line_no_indent) = line.strip_prefix(indent.as_slice()) {
-            result.push(line_no_indent.to_vec())
-        } else {
-            result.push(line.to_owned())
+        let mut i = 0;
+        let mut comment_start = 0;
+        for (start, _, ch) in line.char_indices() {
+            if i >= indentation {
+                comment_start = start;
+                break;
+            }
+            match ch {
+                ' ' => i += 1,
+                '\t' => i += tab_size,
+                _ => {
+                    comment_start = start;
+                    break;
+                }
+            }
         }
+        result.push(line.get(comment_start..).unwrap_or_default().to_vec());
     }
     result
 }
diff --git a/fmt/src/lib.rs b/fmt/src/lib.rs
@@ -71,6 +71,7 @@ pub struct Formatter {
     newline_before_curly_brace: bool,
     empty_line_before_section_header: bool,
     empty_line_after_section_header: bool,
+    tab_size: usize,
 }
 
 impl Default for Formatter {
@@ -92,6 +93,7 @@ impl Formatter {
             newline_before_curly_brace: false,
             empty_line_before_section_header: true,
             empty_line_after_section_header: false,
+            tab_size: 4,
         }
     }
 
@@ -239,6 +241,19 @@ impl Formatter {
         self
     }
 
+    /// Specifies the tab size (in spaces) expected in the unformatted source
+    /// code.
+    ///
+    /// If the input contains tab characters, the formatter uses this value to
+    /// determine how many spaces each tab represents. Setting this incorrectly
+    /// can lead to misaligned formatting when the code mixes tabs and spaces.
+    ///
+    /// Defaults to `4`.
+    pub fn input_tab_size(mut self, tab_size: usize) -> Self {
+        self.tab_size = tab_size;
+        self
+    }
+
     /// Specify if newline should be added before the opening curly brace in a
     /// rule declaration. If false the rule will look like this:
     ///
@@ -379,7 +394,8 @@ impl Formatter {
     where
         I: TokenStream<'a> + 'a,
     {
-        let tokens = comments::CommentProcessor::new(input);
+        let tokens =
+            comments::CommentProcessor::new(input).tab_size(self.tab_size);
 
         // Remove all whitespaces from the original source.
         let tokens = processor::Processor::new(tokens).add_rule(
diff --git a/fmt/src/testdata/default_tests/test35.formatted b/fmt/src/testdata/default_tests/test35.formatted
@@ -0,0 +1,21 @@
+rule test_1 {
+  condition:
+    /*
+            Test
+    */
+    uint16be(0) == 0x4d5a
+}
+
+rule test_2 {
+  condition:
+    /*
+        Test
+    */
+    /*
+        Test
+    */
+    /*
+        Test
+    */
+    uint16be(0) == 0x4d5a
+}
diff --git a/fmt/src/testdata/default_tests/test35.unformatted b/fmt/src/testdata/default_tests/test35.unformatted
@@ -0,0 +1,23 @@
+rule test_1
+{
+	condition:
+		/*
+		        Test
+		*/
+		uint16be(0) == 0x4d5a
+}
+
+rule test_2
+{
+	condition:
+		/*
+            Test
+        */
+        /*
+            Test
+        */
+        /*
+            Test
+        */
+		uint16be(0) == 0x4d5a
+}
diff --git a/fmt/src/tokens/mod.rs b/fmt/src/tokens/mod.rs
@@ -503,15 +503,22 @@ where
                     let token_bytes = &self.source[span.range()];
                     // The whitespace token has a different treatment because
                     // the parser returns a single whitespace token when
-                    // multiple whitespaces appear together. Here we separate
-                    // them into individual spaces.
+                    // multiple whitespaces appear together, and tabs are also
+                    // treated as whitespaces. Here we separate each whitespace
+                    // or tab into its own token.
                     return if kind == SyntaxKind::WHITESPACE {
                         // SAFETY: It's safe to assume that the whitespace
                         // token is composed of valid UTF-8 characters. The
                         // tokenizer guarantees this.
                         let s = unsafe { from_utf8_unchecked(token_bytes) };
-                        for _ in s.chars() {
-                            self.buffer.push_back(Token::Whitespace);
+                        for ch in s.chars() {
+                            match ch {
+                                ' ' => {
+                                    self.buffer.push_back(Token::Whitespace)
+                                }
+                                '\t' => self.buffer.push_back(Token::Tab),
+                                _ => unreachable!(),
+                            };
                         }
                         self.buffer.pop_front()
                     } else {
diff --git a/fmt/src/tokens/tests.rs b/fmt/src/tokens/tests.rs
@@ -107,7 +107,7 @@ fn token_generation() {
 fn whitespaces() {
     let rule = r#"rule test {
         condition:
-            true
+        	true
     }"#;
 
     let events = CSTStream::from(Parser::new(rule.as_bytes()));
@@ -144,10 +144,7 @@ fn whitespaces() {
             Whitespace,
             Whitespace,
             Whitespace,
-            Whitespace,
-            Whitespace,
-            Whitespace,
-            Whitespace,
+            Tab,
             Begin(SyntaxKind::BOOLEAN_EXPR),
             Begin(SyntaxKind::BOOLEAN_TERM),
             Keyword(b"true"),
diff --git a/site/content/docs/cli/commands.md b/site/content/docs/cli/commands.md
@@ -458,3 +458,13 @@ yr fmt <FILE>...
 Run in "check" mode. Doesn't modify any file, but exits error code 0 if the
 files are formatted correctly and no change is necessary, or error code 1
 if otherwise.
+
+### -t, --tab-size \<NUM_SPACES\>
+
+Tab size (in spaces) used in source files
+
+If the input contains tab characters, the formatter uses this value to determine how
+many spaces each tab represents. Setting this incorrectly can lead to misaligned
+formatting when the code mixes tabs and spaces.
+
+By default, it uses 4 spaces.