Skip to content

Commit 6bfe2e2

Browse files
committed
fix: indentation issues in code formatter.
This fixes multiple issues in the formatter while handling comments in code that uses tabs for indentation. It introduces a new `--tab-size` argument to the `yr fmt` command that allows to specify the tab size used in the input code.
1 parent 6b9c294 commit 6bfe2e2

File tree

9 files changed

+157
-26
lines changed

9 files changed

+157
-26
lines changed

cli/src/commands/fmt.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use clap::{arg, value_parser, ArgAction, ArgMatches, Command};
77
use yara_x_fmt::Formatter;
88

99
use crate::config::Config;
10-
use crate::help::FMT_CHECK_MODE;
10+
use crate::help;
1111

1212
pub fn fmt() -> Command {
1313
super::command("fmt")
@@ -19,14 +19,26 @@ pub fn fmt() -> Command {
1919
.value_parser(value_parser!(PathBuf))
2020
.action(ArgAction::Append),
2121
)
22-
.arg(arg!(-c --check "Run in 'check' mode").long_help(FMT_CHECK_MODE))
22+
.arg(
23+
arg!(-c --check "Run in 'check' mode")
24+
.long_help(help::FMT_CHECK_MODE),
25+
)
26+
.arg(
27+
arg!(-t - -"tab-size" <NUM_SPACES>)
28+
.help("Tab size (in spaces) used in source files")
29+
.long_help(help::FMT_TAB_SIZE)
30+
.default_value("4")
31+
.value_parser(value_parser!(usize)),
32+
)
2333
}
2434

2535
pub fn exec_fmt(args: &ArgMatches, config: &Config) -> anyhow::Result<()> {
2636
let files = args.get_many::<PathBuf>("FILE").unwrap();
2737
let check = args.get_flag("check");
38+
let tab_size = args.get_one::<usize>("tab-size").unwrap();
2839

2940
let formatter = Formatter::new()
41+
.input_tab_size(*tab_size)
3042
.align_metadata(config.fmt.meta.align_values)
3143
.align_patterns(config.fmt.patterns.align_values)
3244
.indent_section_headers(config.fmt.rule.indent_section_headers)

cli/src/help.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ pub const FMT_CHECK_MODE: &str = r#"Run in 'check' mode
8686
Doesn't modify the files. Exits with 0 if files are formatted correctly. Exits
8787
with 1 if formatting is required."#;
8888

89+
pub const FMT_TAB_SIZE: &str = r#"Tab size (in spaces) used in source files
90+
91+
If the input contains tab characters, the formatter uses this value to determine how
92+
many spaces each tab represents. Setting this incorrectly can lead to misaligned
93+
formatting when the code mixes tabs and spaces."#;
94+
8995
pub const FIX_ENCODING_LONG_HELP: &str = r#"Convert source files to UTF-8
9096
9197
YARA-X is stricter that YARA with respect to invalid UTF-8 characters in source

fmt/src/comments.rs

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ use crate::tokens::{Token, TokenStream};
2424
/// ```
2525
///
2626
/// This processor must be used with a token stream that still retains the
27-
/// original spacing of the source code, because it needs the spacing for
28-
/// determining the original indentation of the comment. For example:
27+
/// original spacing of the source code (but with tabs replaced by spaces),
28+
/// because it needs the spacing for determining the original indentation
29+
/// of the comment. For example:
2930
///
3031
/// ```text
3132
/// rule test {
@@ -73,6 +74,7 @@ where
7374
start_of_input: bool,
7475
end_of_input: bool,
7576
indentation: usize,
77+
tab_size: usize,
7678
}
7779

7880
/// States used in [`CommentProcessor::process_input_buffer`]
@@ -103,9 +105,18 @@ where
103105
start_of_input: true,
104106
end_of_input: false,
105107
indentation: 0,
108+
tab_size: 4,
106109
}
107110
}
108111

112+
/// Number of spaces in a tab.
113+
///
114+
/// The default is `4`.
115+
pub fn tab_size(mut self, n: usize) -> Self {
116+
self.tab_size = n;
117+
self
118+
}
119+
109120
fn push_comment(
110121
&mut self,
111122
comment_lines: Vec<Vec<u8>>,
@@ -154,7 +165,11 @@ where
154165
State::PreComment { leading_newline } => {
155166
match self.input_buffer.pop_front() {
156167
Some(token @ Token::Whitespace) => {
157-
self.indentation += token.len();
168+
self.indentation += 1;
169+
self.output_buffer.push_back(token);
170+
}
171+
Some(token @ Token::Tab) => {
172+
self.indentation += self.tab_size;
158173
self.output_buffer.push_back(token);
159174
}
160175
// A newline has been found while in PreComment state,
@@ -174,6 +189,7 @@ where
174189
lines: split_comment_lines(
175190
comment,
176191
self.indentation,
192+
self.tab_size,
177193
),
178194
};
179195
self.indentation += token.len();
@@ -189,8 +205,11 @@ where
189205
leading_newline,
190206
indentation,
191207
} => match self.input_buffer.pop_front() {
192-
Some(token @ Token::Whitespace) => {
193-
self.indentation += token.len();
208+
Some(Token::Whitespace) => {
209+
self.indentation += 1;
210+
}
211+
Some(Token::Tab) => {
212+
self.indentation += self.tab_size;
194213
}
195214
// Newline found while in the Comment state. If this is the
196215
// first newline after the comment, the trailing_newline
@@ -241,8 +260,12 @@ where
241260
Some(Token::Comment(comment)) => {
242261
if *indentation == self.indentation {
243262
lines.append(
244-
split_comment_lines(comment, *indentation)
245-
.as_mut(),
263+
split_comment_lines(
264+
comment,
265+
*indentation,
266+
self.tab_size,
267+
)
268+
.as_mut(),
246269
);
247270
*trailing_newline = false;
248271
} else {
@@ -258,6 +281,7 @@ where
258281
lines: split_comment_lines(
259282
comment,
260283
self.indentation,
284+
self.tab_size,
261285
),
262286
};
263287
}
@@ -331,7 +355,7 @@ where
331355
/// Splits a multi-line comment into lines.
332356
///
333357
/// Also removes the specified number of whitespaces from the beginning of
334-
/// each line, except the first one.
358+
/// each line.
335359
///
336360
/// This is necessary because when a multi-line comment that uses the
337361
/// `/* comment */` syntax is indented, the comment itself contains some spaces
@@ -346,16 +370,31 @@ where
346370
/// Notice how the comment contains some spaces (here represented by
347371
/// `<-- indentation -->`) that should be removed/adjusted when the comment
348372
/// is re-indented.
349-
fn split_comment_lines(comment: &[u8], indentation: usize) -> Vec<Vec<u8>> {
373+
fn split_comment_lines(
374+
comment: &[u8],
375+
indentation: usize,
376+
tab_size: usize,
377+
) -> Vec<Vec<u8>> {
350378
let comment = BStr::new(comment);
351-
let indent = b" ".repeat(indentation);
352379
let mut result = Vec::new();
353380
for line in comment.lines() {
354-
if let Some(line_no_indent) = line.strip_prefix(indent.as_slice()) {
355-
result.push(line_no_indent.to_vec())
356-
} else {
357-
result.push(line.to_owned())
381+
let mut i = 0;
382+
let mut comment_start = 0;
383+
for (start, _, ch) in line.char_indices() {
384+
if i >= indentation {
385+
comment_start = start;
386+
break;
387+
}
388+
match ch {
389+
' ' => i += 1,
390+
'\t' => i += tab_size,
391+
_ => {
392+
comment_start = start;
393+
break;
394+
}
395+
}
358396
}
397+
result.push(line.get(comment_start..).unwrap_or_default().to_vec());
359398
}
360399
result
361400
}

fmt/src/lib.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ pub struct Formatter {
7171
newline_before_curly_brace: bool,
7272
empty_line_before_section_header: bool,
7373
empty_line_after_section_header: bool,
74+
tab_size: usize,
7475
}
7576

7677
impl Default for Formatter {
@@ -92,6 +93,7 @@ impl Formatter {
9293
newline_before_curly_brace: false,
9394
empty_line_before_section_header: true,
9495
empty_line_after_section_header: false,
96+
tab_size: 4,
9597
}
9698
}
9799

@@ -239,6 +241,19 @@ impl Formatter {
239241
self
240242
}
241243

244+
/// Specifies the tab size (in spaces) expected in the unformatted source
245+
/// code.
246+
///
247+
/// If the input contains tab characters, the formatter uses this value to
248+
/// determine how many spaces each tab represents. Setting this incorrectly
249+
/// can lead to misaligned formatting when the code mixes tabs and spaces.
250+
///
251+
/// Defaults to `4`.
252+
pub fn input_tab_size(mut self, tab_size: usize) -> Self {
253+
self.tab_size = tab_size;
254+
self
255+
}
256+
242257
/// Specify if newline should be added before the opening curly brace in a
243258
/// rule declaration. If false the rule will look like this:
244259
///
@@ -379,7 +394,8 @@ impl Formatter {
379394
where
380395
I: TokenStream<'a> + 'a,
381396
{
382-
let tokens = comments::CommentProcessor::new(input);
397+
let tokens =
398+
comments::CommentProcessor::new(input).tab_size(self.tab_size);
383399

384400
// Remove all whitespaces from the original source.
385401
let tokens = processor::Processor::new(tokens).add_rule(
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
rule test_1 {
2+
condition:
3+
/*
4+
Test
5+
*/
6+
uint16be(0) == 0x4d5a
7+
}
8+
9+
rule test_2 {
10+
condition:
11+
/*
12+
Test
13+
*/
14+
/*
15+
Test
16+
*/
17+
/*
18+
Test
19+
*/
20+
uint16be(0) == 0x4d5a
21+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
rule test_1
2+
{
3+
condition:
4+
/*
5+
Test
6+
*/
7+
uint16be(0) == 0x4d5a
8+
}
9+
10+
rule test_2
11+
{
12+
condition:
13+
/*
14+
Test
15+
*/
16+
/*
17+
Test
18+
*/
19+
/*
20+
Test
21+
*/
22+
uint16be(0) == 0x4d5a
23+
}

fmt/src/tokens/mod.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -503,15 +503,22 @@ where
503503
let token_bytes = &self.source[span.range()];
504504
// The whitespace token has a different treatment because
505505
// the parser returns a single whitespace token when
506-
// multiple whitespaces appear together. Here we separate
507-
// them into individual spaces.
506+
// multiple whitespaces appear together, and tabs are also
507+
// treated as whitespaces. Here we separate each whitespace
508+
// or tab into its own token.
508509
return if kind == SyntaxKind::WHITESPACE {
509510
// SAFETY: It's safe to assume that the whitespace
510511
// token is composed of valid UTF-8 characters. The
511512
// tokenizer guarantees this.
512513
let s = unsafe { from_utf8_unchecked(token_bytes) };
513-
for _ in s.chars() {
514-
self.buffer.push_back(Token::Whitespace);
514+
for ch in s.chars() {
515+
match ch {
516+
' ' => {
517+
self.buffer.push_back(Token::Whitespace)
518+
}
519+
'\t' => self.buffer.push_back(Token::Tab),
520+
_ => unreachable!(),
521+
};
515522
}
516523
self.buffer.pop_front()
517524
} else {

fmt/src/tokens/tests.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ fn token_generation() {
107107
fn whitespaces() {
108108
let rule = r#"rule test {
109109
condition:
110-
true
110+
true
111111
}"#;
112112

113113
let events = CSTStream::from(Parser::new(rule.as_bytes()));
@@ -144,10 +144,7 @@ fn whitespaces() {
144144
Whitespace,
145145
Whitespace,
146146
Whitespace,
147-
Whitespace,
148-
Whitespace,
149-
Whitespace,
150-
Whitespace,
147+
Tab,
151148
Begin(SyntaxKind::BOOLEAN_EXPR),
152149
Begin(SyntaxKind::BOOLEAN_TERM),
153150
Keyword(b"true"),

site/content/docs/cli/commands.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,3 +458,13 @@ yr fmt <FILE>...
458458
Run in "check" mode. Doesn't modify any file, but exits error code 0 if the
459459
files are formatted correctly and no change is necessary, or error code 1
460460
if otherwise.
461+
462+
### -t, --tab-size \<NUM_SPACES>\
463+
464+
Tab size (in spaces) used in source files
465+
466+
If the input contains tab characters, the formatter uses this value to determine how
467+
many spaces each tab represents. Setting this incorrectly can lead to misaligned
468+
formatting when the code mixes tabs and spaces.
469+
470+
By default, it uses 4 spaces.

0 commit comments

Comments
 (0)