| 
 | 1 | +//! The crate contains tools for converting between byte offsets and line / column positions.  | 
 | 2 | +
  | 
 | 3 | +#![deny(clippy::use_self)]  | 
 | 4 | + | 
 | 5 | +use biome_text_size::TextSize;  | 
 | 6 | + | 
 | 7 | +mod line_index;  | 
 | 8 | + | 
 | 9 | +pub use line_index::LineIndex;  | 
 | 10 | + | 
 | 11 | +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]  | 
 | 12 | +pub enum WideEncoding {  | 
 | 13 | +    Utf16,  | 
 | 14 | +    Utf32,  | 
 | 15 | +}  | 
 | 16 | + | 
 | 17 | +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]  | 
 | 18 | +pub struct LineCol {  | 
 | 19 | +    /// Zero-based  | 
 | 20 | +    pub line: u32,  | 
 | 21 | +    /// Zero-based utf8 offset  | 
 | 22 | +    pub col: u32,  | 
 | 23 | +}  | 
 | 24 | + | 
 | 25 | +/// Deliberately not a generic type and different from `LineCol`.  | 
 | 26 | +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]  | 
 | 27 | +pub struct WideLineCol {  | 
 | 28 | +    /// Zero-based  | 
 | 29 | +    pub line: u32,  | 
 | 30 | +    /// Zero-based  | 
 | 31 | +    pub col: u32,  | 
 | 32 | +}  | 
 | 33 | + | 
 | 34 | +#[derive(Clone, Debug, Hash, PartialEq, Eq)]  | 
 | 35 | +pub struct WideChar {  | 
 | 36 | +    /// Start offset of a character inside a line, zero-based  | 
 | 37 | +    pub start: TextSize,  | 
 | 38 | +    /// End offset of a character inside a line, zero-based  | 
 | 39 | +    pub end: TextSize,  | 
 | 40 | +}  | 
 | 41 | + | 
 | 42 | +impl WideChar {  | 
 | 43 | +    /// Returns the length in 8-bit UTF-8 code units.  | 
 | 44 | +    fn len(&self) -> TextSize {  | 
 | 45 | +        self.end - self.start  | 
 | 46 | +    }  | 
 | 47 | + | 
 | 48 | +    /// Returns the length in UTF-16 or UTF-32 code units.  | 
 | 49 | +    fn wide_len(&self, enc: WideEncoding) -> usize {  | 
 | 50 | +        match enc {  | 
 | 51 | +            WideEncoding::Utf16 => {  | 
 | 52 | +                if self.len() == TextSize::from(4) {  | 
 | 53 | +                    2  | 
 | 54 | +                } else {  | 
 | 55 | +                    1  | 
 | 56 | +                }  | 
 | 57 | +            }  | 
 | 58 | + | 
 | 59 | +            WideEncoding::Utf32 => 1,  | 
 | 60 | +        }  | 
 | 61 | +    }  | 
 | 62 | +}  | 
 | 63 | + | 
 | 64 | +#[cfg(test)]  | 
 | 65 | +mod tests {  | 
 | 66 | +    use crate::WideEncoding::{Utf16, Utf32};  | 
 | 67 | +    use crate::WideLineCol;  | 
 | 68 | +    use crate::line_index::LineIndex;  | 
 | 69 | +    use crate::{LineCol, WideEncoding};  | 
 | 70 | +    use biome_text_size::TextSize;  | 
 | 71 | + | 
 | 72 | +    macro_rules! check_conversion {  | 
 | 73 | +        ($line_index:ident : $wide_line_col:expr => $text_size:expr ) => {  | 
 | 74 | +            let encoding = WideEncoding::Utf16;  | 
 | 75 | + | 
 | 76 | +            let line_col = $line_index.to_utf8(encoding, $wide_line_col);  | 
 | 77 | +            let offset = $line_index.offset(line_col);  | 
 | 78 | +            assert_eq!(offset, Some($text_size));  | 
 | 79 | + | 
 | 80 | +            let line_col = $line_index.line_col(offset.unwrap());  | 
 | 81 | +            let wide_line_col = $line_index.to_wide(encoding, line_col.unwrap());  | 
 | 82 | +            assert_eq!(wide_line_col, Some($wide_line_col));  | 
 | 83 | +        };  | 
 | 84 | +    }  | 
 | 85 | + | 
 | 86 | +    #[test]  | 
 | 87 | +    fn empty_string() {  | 
 | 88 | +        let line_index = LineIndex::new("");  | 
 | 89 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 0 } => TextSize::from(0));  | 
 | 90 | +    }  | 
 | 91 | + | 
 | 92 | +    #[test]  | 
 | 93 | +    fn empty_line() {  | 
 | 94 | +        let line_index = LineIndex::new("\n\n");  | 
 | 95 | +        check_conversion!(line_index: WideLineCol { line: 1, col: 0 } => TextSize::from(1));  | 
 | 96 | +    }  | 
 | 97 | + | 
 | 98 | +    #[test]  | 
 | 99 | +    fn line_end() {  | 
 | 100 | +        let line_index = LineIndex::new("abc\ndef\nghi");  | 
 | 101 | +        check_conversion!(line_index: WideLineCol { line: 1, col: 3 } => TextSize::from(7));  | 
 | 102 | +    }  | 
 | 103 | + | 
 | 104 | +    #[test]  | 
 | 105 | +    fn out_of_bounds_line() {  | 
 | 106 | +        let line_index = LineIndex::new("abcde\nfghij\n");  | 
 | 107 | + | 
 | 108 | +        let offset = line_index.offset(LineCol { line: 5, col: 0 });  | 
 | 109 | +        assert!(offset.is_none());  | 
 | 110 | +    }  | 
 | 111 | + | 
 | 112 | +    #[test]  | 
 | 113 | +    fn unicode() {  | 
 | 114 | +        let line_index = LineIndex::new("'Jan 1, 2018 – Jan 1, 2019'");  | 
 | 115 | + | 
 | 116 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 0 } => TextSize::from(0));  | 
 | 117 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 1 } => TextSize::from(1));  | 
 | 118 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 12 } => TextSize::from(12));  | 
 | 119 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 13 } => TextSize::from(15));  | 
 | 120 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 14 } => TextSize::from(18));  | 
 | 121 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 15 } => TextSize::from(21));  | 
 | 122 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 26 } => TextSize::from(32));  | 
 | 123 | +        check_conversion!(line_index: WideLineCol { line: 0, col: 27 } => TextSize::from(33));  | 
 | 124 | +    }  | 
 | 125 | + | 
 | 126 | +    #[ignore]  | 
 | 127 | +    #[test]  | 
 | 128 | +    fn test_every_chars() {  | 
 | 129 | +        let text: String = {  | 
 | 130 | +            let mut chars: Vec<char> = ((0 as char)..char::MAX).collect();  | 
 | 131 | +            chars.extend("\n".repeat(chars.len() / 16).chars());  | 
 | 132 | +            chars.into_iter().collect()  | 
 | 133 | +        };  | 
 | 134 | + | 
 | 135 | +        let line_index = LineIndex::new(&text);  | 
 | 136 | + | 
 | 137 | +        let mut lin_col = LineCol { line: 0, col: 0 };  | 
 | 138 | +        let mut col_utf16 = 0;  | 
 | 139 | +        let mut col_utf32 = 0;  | 
 | 140 | +        for (offset, char) in text.char_indices() {  | 
 | 141 | +            let got_offset = line_index.offset(lin_col).unwrap();  | 
 | 142 | +            assert_eq!(usize::from(got_offset), offset);  | 
 | 143 | + | 
 | 144 | +            let got_lin_col = line_index.line_col(got_offset).unwrap();  | 
 | 145 | +            assert_eq!(got_lin_col, lin_col);  | 
 | 146 | + | 
 | 147 | +            for enc in [Utf16, Utf32] {  | 
 | 148 | +                let wide_lin_col = line_index.to_wide(enc, lin_col).unwrap();  | 
 | 149 | +                let got_lin_col = line_index.to_utf8(enc, wide_lin_col);  | 
 | 150 | +                assert_eq!(got_lin_col, lin_col);  | 
 | 151 | + | 
 | 152 | +                let want_col = match enc {  | 
 | 153 | +                    Utf16 => col_utf16,  | 
 | 154 | +                    Utf32 => col_utf32,  | 
 | 155 | +                };  | 
 | 156 | +                assert_eq!(wide_lin_col.col, want_col)  | 
 | 157 | +            }  | 
 | 158 | + | 
 | 159 | +            if char == '\n' {  | 
 | 160 | +                lin_col.line += 1;  | 
 | 161 | +                lin_col.col = 0;  | 
 | 162 | +                col_utf16 = 0;  | 
 | 163 | +                col_utf32 = 0;  | 
 | 164 | +            } else {  | 
 | 165 | +                lin_col.col += char.len_utf8() as u32;  | 
 | 166 | +                col_utf16 += char.len_utf16() as u32;  | 
 | 167 | +                col_utf32 += 1;  | 
 | 168 | +            }  | 
 | 169 | +        }  | 
 | 170 | +    }  | 
 | 171 | +}  | 
0 commit comments