diff --git a/.golangci.yml b/.golangci.yml index 91f313b1a..98e943676 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -48,6 +48,7 @@ linters: - recvcheck - tenv - err113 + - prealloc linters-settings: gocyclo: @@ -62,7 +63,6 @@ linters-settings: # - (Must)?NewLexer$ exclude_godoc_examples: false - issues: exclude-dirs: - _examples @@ -71,19 +71,19 @@ issues: exclude-use-default: false exclude: # Captured by errcheck. - - '^(G104|G204):' + - "^(G104|G204):" # Very commonly not checked. - 'Error return value of .(.*\.Help|.*\.MarkFlagRequired|(os\.)?std(out|err)\..*|.*Close|.*Flush|os\.Remove(All)?|.*printf?|os\.(Un)?Setenv). is not checked' - 'exported method (.*\.MarshalJSON|.*\.UnmarshalJSON|.*\.EntityURN|.*\.GoString|.*\.Pos) should have comment or be unexported' - - 'composite literal uses unkeyed fields' + - "composite literal uses unkeyed fields" - 'declaration of "err" shadows declaration' - - 'should not use dot imports' - - 'Potential file inclusion via variable' - - 'should have comment or be unexported' - - 'comment on exported var .* should be of the form' - - 'at least one file in a package should have a package comment' - - 'string literal contains the Unicode' - - 'methods on the same type should have the same receiver name' - - '_TokenType_name should be _TokenTypeName' - - '`_TokenType_map` should be `_TokenTypeMap`' - - 'rewrite if-else to switch statement' + - "should not use dot imports" + - "Potential file inclusion via variable" + - "should have comment or be unexported" + - "comment on exported var .* should be of the form" + - "at least one file in a package should have a package comment" + - "string literal contains the Unicode" + - "methods on the same type should have the same receiver name" + - "_TokenType_name should be _TokenTypeName" + - "`_TokenType_map` should be `_TokenTypeMap`" + - "rewrite if-else to switch statement" diff --git a/_tools/exercise/main.go b/_tools/exercise/main.go index b22c7f2b8..a9cf18067 100644 --- a/_tools/exercise/main.go +++ b/_tools/exercise/main.go @@ -5,10 +5,11 @@ import ( "io/ioutil" "os" + "gopkg.in/alecthomas/kingpin.v3-unstable" + "github.com/alecthomas/chroma/v2/formatters" "github.com/alecthomas/chroma/v2/lexers" "github.com/alecthomas/chroma/v2/styles" - "gopkg.in/alecthomas/kingpin.v3-unstable" ) var ( diff --git a/bin/.gosimports-0.3.8.pkg b/bin/.gosimports-0.3.8.pkg new file mode 120000 index 000000000..383f4511d --- /dev/null +++ b/bin/.gosimports-0.3.8.pkg @@ -0,0 +1 @@ +hermit \ No newline at end of file diff --git a/bin/gosimports b/bin/gosimports new file mode 120000 index 000000000..61d5bc3a0 --- /dev/null +++ b/bin/gosimports @@ -0,0 +1 @@ +.gosimports-0.3.8.pkg \ No newline at end of file diff --git a/cmd/chroma/go.mod b/cmd/chroma/go.mod index fde62750d..8dc038023 100644 --- a/cmd/chroma/go.mod +++ b/cmd/chroma/go.mod @@ -1,6 +1,6 @@ module github.com/alecthomas/chroma/v2/cmd/chroma -go 1.22 +go 1.23 toolchain go1.25.1 diff --git a/cmd/chroma/main.go b/cmd/chroma/main.go index cf3916b7a..3257bd922 100644 --- a/cmd/chroma/main.go +++ b/cmd/chroma/main.go @@ -393,7 +393,10 @@ func format(ctx *kong.Context, w io.Writer, style *chroma.Style, it chroma.Itera func check(filename string, it chroma.Iterator) { line, col := 1, 0 - for token := it(); token != chroma.EOF; token = it() { + for token := range it { + if token == chroma.EOF { + break + } if token.Type == chroma.Error { fmt.Printf("%s:%d:%d %q\n", filename, line, col, token.String()) } diff --git a/cmd/chromad/go.mod b/cmd/chromad/go.mod index 
2870b96f6..fcd87488e 100644 --- a/cmd/chromad/go.mod +++ b/cmd/chromad/go.mod @@ -1,6 +1,6 @@ module github.com/alecthomas/chroma/v2/cmd/chromad -go 1.22 +go 1.23 toolchain go1.25.1 diff --git a/coalesce.go b/coalesce.go index f5048951a..9729ca54b 100644 --- a/coalesce.go +++ b/coalesce.go @@ -1,18 +1,23 @@ package chroma +import "iter" + // Coalesce is a Lexer interceptor that collapses runs of common types into a single token. func Coalesce(lexer Lexer) Lexer { return &coalescer{lexer} } type coalescer struct{ Lexer } -func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { - var prev Token +func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) { it, err := d.Lexer.Tokenise(options, text) if err != nil { return nil, err } - return func() Token { - for token := it(); token != (EOF); token = it() { + return func(yield func(Token) bool) { + var prev Token + for token := range it { + if token == EOF { + break + } if len(token.Value) == 0 { continue } @@ -22,14 +27,15 @@ func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, e if prev.Type == token.Type && len(prev.Value) < 8192 { prev.Value += token.Value } else { - out := prev + if !yield(prev) { + return + } prev = token - return out } } } - out := prev - prev = EOF - return out + if prev != EOF { + yield(prev) + } }, nil } diff --git a/delegate.go b/delegate.go index 298f2dbbd..c63b40ceb 100644 --- a/delegate.go +++ b/delegate.go @@ -2,6 +2,7 @@ package chroma import ( "bytes" + "iter" ) type delegatingLexer struct { @@ -58,7 +59,7 @@ type insertion struct { tokens []Token } -func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint: gocognit +func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) { // nolint: gocognit tokens, err := Tokenise(Coalesce(d.language), options, text) if err != nil { return nil, err diff --git a/delegate_test.go b/delegate_test.go index 083896afa..207fba4e6 100644 --- a/delegate_test.go +++ b/delegate_test.go @@ -1,6 +1,7 @@ package chroma import ( + "slices" "testing" assert "github.com/alecthomas/assert/v2" @@ -104,7 +105,7 @@ func TestDelegate(t *testing.T) { t.Run(test.name, func(t *testing.T) { it, err := delegate.Tokenise(nil, test.source) assert.NoError(t, err) - actual := it.Tokens() + actual := slices.Collect(it) assert.Equal(t, test.expected, actual) }) } diff --git a/emitters.go b/emitters.go index 1097a7576..c435b498b 100644 --- a/emitters.go +++ b/emitters.go @@ -2,12 +2,13 @@ package chroma import ( "fmt" + "iter" ) // An Emitter takes group matches and returns tokens. type Emitter interface { // Emit tokens for the given regex groups. - Emit(groups []string, state *LexerState) Iterator + Emit(groups []string, state *LexerState) iter.Seq[Token] } // ValidatingEmitter is an Emitter that can validate against a compiled rule. @@ -23,10 +24,10 @@ type SerialisableEmitter interface { } // EmitterFunc is a function that is an Emitter. -type EmitterFunc func(groups []string, state *LexerState) Iterator +type EmitterFunc func(groups []string, state *LexerState) iter.Seq[Token] // Emit tokens for groups. 
-func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator { +func (e EmitterFunc) Emit(groups []string, state *LexerState) iter.Seq[Token] { return e(groups, state) } @@ -52,8 +53,8 @@ func (b *byGroupsEmitter) ValidateEmitter(rule *CompiledRule) error { return nil } -func (b *byGroupsEmitter) Emit(groups []string, state *LexerState) Iterator { - iterators := make([]Iterator, 0, len(groups)-1) +func (b *byGroupsEmitter) Emit(groups []string, state *LexerState) iter.Seq[Token] { + iterators := make([]iter.Seq[Token], 0, len(groups)-1) if len(b.Emitters) != len(groups)-1 { iterators = append(iterators, Error.Emit(groups, state)) // panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters)) @@ -69,8 +70,8 @@ func (b *byGroupsEmitter) Emit(groups []string, state *LexerState) Iterator { // ByGroupNames emits a token for each named matching group in the rule's regex. func ByGroupNames(emitters map[string]Emitter) Emitter { - return EmitterFunc(func(groups []string, state *LexerState) Iterator { - iterators := make([]Iterator, 0, len(state.NamedGroups)-1) + return EmitterFunc(func(groups []string, state *LexerState) iter.Seq[Token] { + iterators := make([]iter.Seq[Token], 0, len(state.NamedGroups)-1) if len(state.NamedGroups)-1 == 0 { if emitter, ok := emitters[`0`]; ok { iterators = append(iterators, emitter.Emit(groups, state)) @@ -147,7 +148,7 @@ type usingByGroup struct { } func (u *usingByGroup) EmitterKind() string { return "usingbygroup" } -func (u *usingByGroup) Emit(groups []string, state *LexerState) Iterator { +func (u *usingByGroup) Emit(groups []string, state *LexerState) iter.Seq[Token] { // bounds check if len(u.Emitters) != len(groups)-1 { panic("UsingByGroup expects number of emitters to be the same as len(groups)-1") @@ -157,7 +158,7 @@ func (u *usingByGroup) Emit(groups []string, state *LexerState) Iterator { sublexer := state.Registry.Get(groups[u.SublexerNameGroup]) // build iterators - iterators := make([]Iterator, len(groups)-1) + iterators := make([]iter.Seq[Token], len(groups)-1) for i, group := range groups[1:] { if i == u.CodeGroup-1 && sublexer != nil { var err error @@ -176,7 +177,7 @@ func (u *usingByGroup) Emit(groups []string, state *LexerState) Iterator { // // This Emitter is not serialisable. 
func UsingLexer(lexer Lexer) Emitter { - return EmitterFunc(func(groups []string, _ *LexerState) Iterator { + return EmitterFunc(func(groups []string, _ *LexerState) iter.Seq[Token] { it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0]) if err != nil { panic(err) @@ -191,7 +192,7 @@ type usingEmitter struct { func (u *usingEmitter) EmitterKind() string { return "using" } -func (u *usingEmitter) Emit(groups []string, state *LexerState) Iterator { +func (u *usingEmitter) Emit(groups []string, state *LexerState) iter.Seq[Token] { if state.Registry == nil { panic(fmt.Sprintf("no LexerRegistry available for Using(%q)", u.Lexer)) } @@ -219,7 +220,7 @@ type usingSelfEmitter struct { func (u *usingSelfEmitter) EmitterKind() string { return "usingself" } -func (u *usingSelfEmitter) Emit(groups []string, state *LexerState) Iterator { +func (u *usingSelfEmitter) Emit(groups []string, state *LexerState) iter.Seq[Token] { it, err := state.Lexer.Tokenise(&TokeniseOptions{State: u.State, Nested: true}, groups[0]) if err != nil { panic(err) diff --git a/formatter.go b/formatter.go index 00dd5d8df..53635a951 100644 --- a/formatter.go +++ b/formatter.go @@ -2,6 +2,7 @@ package chroma import ( "io" + "iter" ) // A Formatter for Chroma lexers. @@ -9,15 +10,15 @@ type Formatter interface { // Format returns a formatting function for tokens. // // If the iterator panics, the Formatter should recover. - Format(w io.Writer, style *Style, iterator Iterator) error + Format(w io.Writer, style *Style, iterator iter.Seq[Token]) error } // A FormatterFunc is a Formatter implemented as a function. // // Guards against iterator panics. -type FormatterFunc func(w io.Writer, style *Style, iterator Iterator) error +type FormatterFunc func(w io.Writer, style *Style, iterator iter.Seq[Token]) error -func (f FormatterFunc) Format(w io.Writer, s *Style, it Iterator) (err error) { // nolint +func (f FormatterFunc) Format(w io.Writer, s *Style, it iter.Seq[Token]) (err error) { // nolint defer func() { if perr := recover(); perr != nil { err = perr.(error) @@ -30,7 +31,7 @@ type recoveringFormatter struct { Formatter } -func (r recoveringFormatter) Format(w io.Writer, s *Style, it Iterator) (err error) { +func (r recoveringFormatter) Format(w io.Writer, s *Style, it iter.Seq[Token]) (err error) { defer func() { if perr := recover(); perr != nil { err = perr.(error) diff --git a/formatters/api.go b/formatters/api.go index 9ca0d01dd..c2a53e439 100644 --- a/formatters/api.go +++ b/formatters/api.go @@ -2,6 +2,7 @@ package formatters import ( "io" + "iter" "sort" "github.com/alecthomas/chroma/v2" @@ -11,8 +12,11 @@ import ( var ( // NoOp formatter. 
- NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, iterator chroma.Iterator) error { - for t := iterator(); t != chroma.EOF; t = iterator() { + NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, iterator iter.Seq[chroma.Token]) error { + for t := range iterator { + if t == chroma.EOF { + break + } if _, err := io.WriteString(w, t.Value); err != nil { return err } diff --git a/formatters/html/html.go b/formatters/html/html.go index c1c8875b2..438a488d8 100644 --- a/formatters/html/html.go +++ b/formatters/html/html.go @@ -4,6 +4,8 @@ import ( "fmt" "html" "io" + "iter" + "slices" "sort" "strconv" "strings" @@ -221,8 +223,8 @@ func (h highlightRanges) Len() int { return len(h) } func (h highlightRanges) Swap(i, j int) { h[i], h[j] = h[j], h[i] } func (h highlightRanges) Less(i, j int) bool { return h[i][0] < h[j][0] } -func (f *Formatter) Format(w io.Writer, style *chroma.Style, iterator chroma.Iterator) (err error) { - return f.writeHTML(w, style, iterator.Tokens()) +func (f *Formatter) Format(w io.Writer, style *chroma.Style, iterator iter.Seq[chroma.Token]) (err error) { + return f.writeHTML(w, style, slices.Collect(iterator)) } // We deliberately don't use html/template here because it is two orders of magnitude slower (benchmarked). diff --git a/formatters/json.go b/formatters/json.go index 436d3ce8c..1b0d256ae 100644 --- a/formatters/json.go +++ b/formatters/json.go @@ -4,17 +4,21 @@ import ( "encoding/json" "fmt" "io" + "iter" "github.com/alecthomas/chroma/v2" ) // JSON formatter outputs the raw token structures as JSON. -var JSON = Register("json", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error { +var JSON = Register("json", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it iter.Seq[chroma.Token]) error { if _, err := fmt.Fprintln(w, "["); err != nil { return err } i := 0 - for t := it(); t != chroma.EOF; t = it() { + for t := range it { + if t == chroma.EOF { + break + } if i > 0 { if _, err := fmt.Fprintln(w, ","); err != nil { return err diff --git a/formatters/svg/svg.go b/formatters/svg/svg.go index 6d457f90a..938b79c36 100644 --- a/formatters/svg/svg.go +++ b/formatters/svg/svg.go @@ -6,8 +6,10 @@ import ( "errors" "fmt" "io" + "iter" "os" "path" + "slices" "strings" "github.com/alecthomas/chroma/v2" @@ -61,8 +63,8 @@ type Formatter struct { fontFormat FontFormat } -func (f *Formatter) Format(w io.Writer, style *chroma.Style, iterator chroma.Iterator) (err error) { - f.writeSVG(w, style, iterator.Tokens()) +func (f *Formatter) Format(w io.Writer, style *chroma.Style, iterator iter.Seq[chroma.Token]) (err error) { + f.writeSVG(w, style, slices.Collect(iterator)) return err } diff --git a/formatters/tokens.go b/formatters/tokens.go index 3bdd57ccf..66f161851 100644 --- a/formatters/tokens.go +++ b/formatters/tokens.go @@ -3,13 +3,17 @@ package formatters import ( "fmt" "io" + "iter" "github.com/alecthomas/chroma/v2" ) // Tokens formatter outputs the raw token structures. 
-var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error { - for t := it(); t != chroma.EOF; t = it() { +var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it iter.Seq[chroma.Token]) error { + for t := range it { + if t == chroma.EOF { + break + } if _, err := fmt.Fprintln(w, t.GoString()); err != nil { return err } diff --git a/formatters/tty_indexed.go b/formatters/tty_indexed.go index d48fb993c..27881068e 100644 --- a/formatters/tty_indexed.go +++ b/formatters/tty_indexed.go @@ -2,6 +2,7 @@ package formatters import ( "io" + "iter" "math" "github.com/alecthomas/chroma/v2" @@ -237,9 +238,12 @@ type indexedTTYFormatter struct { table *ttyTable } -func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style, it chroma.Iterator) (err error) { +func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style, it iter.Seq[chroma.Token]) (err error) { theme := styleToEscapeSequence(c.table, style) - for token := it(); token != chroma.EOF; token = it() { + for token := range it { + if token == chroma.EOF { + break + } clr, ok := theme[token.Type] // This search mimics how styles.Get() is used in tty_truecolour.go. diff --git a/formatters/tty_indexed_test.go b/formatters/tty_indexed_test.go index 462194671..924530271 100644 --- a/formatters/tty_indexed_test.go +++ b/formatters/tty_indexed_test.go @@ -5,6 +5,7 @@ import ( "testing" assert "github.com/alecthomas/assert/v2" + "github.com/alecthomas/chroma/v2" ) diff --git a/formatters/tty_truecolour.go b/formatters/tty_truecolour.go index 43b096476..a532684c9 100644 --- a/formatters/tty_truecolour.go +++ b/formatters/tty_truecolour.go @@ -3,6 +3,7 @@ package formatters import ( "fmt" "io" + "iter" "regexp" "github.com/alecthomas/chroma/v2" @@ -44,9 +45,12 @@ func writeToken(w io.Writer, formatting string, text string) { } } -func trueColourFormatter(w io.Writer, style *chroma.Style, it chroma.Iterator) error { +func trueColourFormatter(w io.Writer, style *chroma.Style, it iter.Seq[chroma.Token]) error { style = clearBackground(style) - for token := it(); token != chroma.EOF; token = it() { + for token := range it { + if token == chroma.EOF { + break + } entry := style.Get(token.Type) if entry.IsZero() { fmt.Fprint(w, token.Value) diff --git a/go.mod b/go.mod index f4ce088c3..0753017d0 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/alecthomas/chroma/v2 -go 1.22 +go 1.23 require ( github.com/alecthomas/assert/v2 v2.11.0 diff --git a/iterator.go b/iterator.go index cf39bb577..ca93ad151 100644 --- a/iterator.go +++ b/iterator.go @@ -1,57 +1,34 @@ package chroma -import "strings" +import ( + "iter" + "strings" +) -// An Iterator across tokens. -// -// EOF will be returned at the end of the Token stream. -// -// If an error occurs within an Iterator, it may propagate this in a panic. Formatters should recover. -type Iterator func() Token - -// Tokens consumes all tokens from the iterator and returns them as a slice. -func (i Iterator) Tokens() []Token { - var out []Token - for t := i(); t != EOF; t = i() { - out = append(out, t) - } - return out -} - -// Stdlib converts a Chroma iterator to a Go 1.23-compatible iterator. -func (i Iterator) Stdlib() func(yield func(Token) bool) { +// Concaterator concatenates tokens from a series of iterators. 
+func Concaterator(iterators ...iter.Seq[Token]) iter.Seq[Token] { return func(yield func(Token) bool) { - for t := i(); t != EOF; t = i() { - if !yield(t) { - return + for _, it := range iterators { + for t := range it { + if t == EOF { + break + } + if !yield(t) { + return + } } } } } -// Concaterator concatenates tokens from a series of iterators. -func Concaterator(iterators ...Iterator) Iterator { - return func() Token { - for len(iterators) > 0 { - t := iterators[0]() - if t != EOF { - return t +// Literator converts a sequence of literal Tokens into an iter.Seq[Token]. +func Literator(tokens ...Token) iter.Seq[Token] { + return func(yield func(Token) bool) { + for _, token := range tokens { + if !yield(token) { + return } - iterators = iterators[1:] - } - return EOF - } -} - -// Literator converts a sequence of literal Tokens into an Iterator. -func Literator(tokens ...Token) Iterator { - return func() Token { - if len(tokens) == 0 { - return EOF } - token := tokens[0] - tokens = tokens[1:] - return token } } diff --git a/lexer.go b/lexer.go index 602db1c4f..8c95d9053 100644 --- a/lexer.go +++ b/lexer.go @@ -2,6 +2,7 @@ package chroma import ( "fmt" + "iter" "strings" ) @@ -112,8 +113,8 @@ type TokeniseOptions struct { type Lexer interface { // Config describing the features of the Lexer. Config() *Config - // Tokenise returns an Iterator over tokens in text. - Tokenise(options *TokeniseOptions, text string) (Iterator, error) + // Tokenise returns an iter.Seq[Token] over tokens in text. + Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) // SetRegistry sets the registry this Lexer is associated with. // // The registry should be used by the Lexer if it needs to look up other diff --git a/lexers/go_test.go b/lexers/go_test.go index 12128446d..62ee3aee7 100644 --- a/lexers/go_test.go +++ b/lexers/go_test.go @@ -4,6 +4,7 @@ import ( "testing" assert "github.com/alecthomas/assert/v2" + "github.com/alecthomas/chroma/v2" ) diff --git a/lexers/http.go b/lexers/http.go index b57cb1b84..b994e106a 100644 --- a/lexers/http.go +++ b/lexers/http.go @@ -1,6 +1,7 @@ package lexers import ( + "iter" "strings" . "github.com/alecthomas/chroma/v2" // nolint @@ -36,14 +37,14 @@ func httpRules() Rules { } } -func httpContentBlock(groups []string, state *LexerState) Iterator { +func httpContentBlock(groups []string, state *LexerState) iter.Seq[Token] { tokens := []Token{ {Generic, groups[0]}, } return Literator(tokens...) } -func httpHeaderBlock(groups []string, state *LexerState) Iterator { +func httpHeaderBlock(groups []string, state *LexerState) iter.Seq[Token] { tokens := []Token{ {Name, groups[1]}, {Text, groups[2]}, @@ -55,7 +56,7 @@ func httpHeaderBlock(groups []string, state *LexerState) Iterator { return Literator(tokens...) 
} -func httpContinuousHeaderBlock(groups []string, state *LexerState) Iterator { +func httpContinuousHeaderBlock(groups []string, state *LexerState) iter.Seq[Token] { tokens := []Token{ {Text, groups[1]}, {Literal, groups[2]}, @@ -68,64 +69,72 @@ func httpBodyContentTypeLexer(lexer Lexer) Lexer { return &httpBodyContentTyper{ type httpBodyContentTyper struct{ Lexer } -func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint: gocognit - var contentType string - var isContentType bool - var subIterator Iterator - +func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) { // nolint: gocognit it, err := d.Lexer.Tokenise(options, text) if err != nil { return nil, err } - return func() Token { - token := it() + return func(yield func(Token) bool) { + var contentType string + var isContentType bool + var subIterator iter.Seq[Token] - if token == EOF { - if subIterator != nil { - return subIterator() + for token := range it { + if token == EOF { + break } - return EOF - } - switch { - case token.Type == Name && strings.ToLower(token.Value) == "content-type": - { - isContentType = true - } - case token.Type == Literal && isContentType: - { - isContentType = false - contentType = strings.TrimSpace(token.Value) - pos := strings.Index(contentType, ";") - if pos > 0 { - contentType = strings.TrimSpace(contentType[:pos]) + switch { + case token.Type == Name && strings.ToLower(token.Value) == "content-type": + { + isContentType = true } - } - case token.Type == Generic && contentType != "": - { - lexer := MatchMimeType(contentType) - - // application/calendar+xml can be treated as application/xml - // if there's not a better match. - if lexer == nil && strings.Contains(contentType, "+") { - slashPos := strings.Index(contentType, "/") - plusPos := strings.LastIndex(contentType, "+") - contentType = contentType[:slashPos+1] + contentType[plusPos+1:] - lexer = MatchMimeType(contentType) + case token.Type == Literal && isContentType: + { + isContentType = false + contentType = strings.TrimSpace(token.Value) + pos := strings.Index(contentType, ";") + if pos > 0 { + contentType = strings.TrimSpace(contentType[:pos]) + } } + case token.Type == Generic && contentType != "": + { + lexer := MatchMimeType(contentType) + + // application/calendar+xml can be treated as application/xml + // if there's not a better match. 
+ if lexer == nil && strings.Contains(contentType, "+") { + slashPos := strings.Index(contentType, "/") + plusPos := strings.LastIndex(contentType, "+") + contentType = contentType[:slashPos+1] + contentType[plusPos+1:] + lexer = MatchMimeType(contentType) + } - if lexer == nil { - token.Type = Text - } else { - subIterator, err = lexer.Tokenise(nil, token.Value) - if err != nil { - panic(err) + if lexer == nil { + token.Type = Text + } else { + subIterator, err = lexer.Tokenise(nil, token.Value) + if err != nil { + panic(err) + } + // Emit tokens from the sub-iterator + for st := range subIterator { + if st == EOF { + break + } + if !yield(st) { + return + } + } + continue } - return EOF } } + if !yield(token) { + return + } } - return token }, nil } diff --git a/lexers/lexer_benchmark_test.go b/lexers/lexer_benchmark_test.go index acb4117ef..f36a18ba5 100644 --- a/lexers/lexer_benchmark_test.go +++ b/lexers/lexer_benchmark_test.go @@ -212,7 +212,10 @@ func Benchmark(b *testing.B) { for range b.N { it, err := lexers.GlobalLexerRegistry.Get("Java").Tokenise(nil, lexerBenchSource) assert.NoError(b, err) - for t := it(); t != chroma.EOF; t = it() { + for t := range it { + if t == chroma.EOF { + break + } } } } diff --git a/lexers/raku.go b/lexers/raku.go index da354dce6..c2c489d0d 100644 --- a/lexers/raku.go +++ b/lexers/raku.go @@ -1,6 +1,7 @@ package lexers import ( + "iter" "regexp" "slices" "strings" @@ -1505,8 +1506,8 @@ func makeRule(config ruleMakingConfig) *CompiledRule { // Emitter for colon pairs, changes token state based on key and brackets func colonPair(tokenClass TokenType) Emitter { - return EmitterFunc(func(groups []string, state *LexerState) Iterator { - iterators := []Iterator{} + return EmitterFunc(func(groups []string, state *LexerState) iter.Seq[Token] { + iterators := []iter.Seq[Token]{} tokens := []Token{ {Punctuation, state.NamedGroups[`colon`]}, {Punctuation, state.NamedGroups[`opening_delimiters`]}, @@ -1581,10 +1582,10 @@ func colonPair(tokenClass TokenType) Emitter { } // Emitter for quoting constructs, changes token state based on quote name and adverbs -func quote(groups []string, state *LexerState) Iterator { +func quote(groups []string, state *LexerState) iter.Seq[Token] { keyword := state.NamedGroups[`keyword`] adverbsStr := state.NamedGroups[`adverbs`] - iterators := []Iterator{} + iterators := []iter.Seq[Token]{} tokens := []Token{ {Keyword, keyword}, {StringAffix, adverbsStr}, @@ -1649,7 +1650,7 @@ func quote(groups []string, state *LexerState) Iterator { } // Emitter for pod config, tokenises the properties with "colon-pair-attribute" state -func podConfig(groups []string, state *LexerState) Iterator { +func podConfig(groups []string, state *LexerState) iter.Seq[Token] { // Tokenise pod config iterator, err := state.Lexer.Tokenise( &TokeniseOptions{ @@ -1665,8 +1666,8 @@ func podConfig(groups []string, state *LexerState) Iterator { } // Emitter for pod code, tokenises the code based on the lang specified -func podCode(groups []string, state *LexerState) Iterator { - iterators := []Iterator{} +func podCode(groups []string, state *LexerState) iter.Seq[Token] { + iterators := []iter.Seq[Token]{} tokens := []Token{ {Comment, state.NamedGroups[`ws`]}, {Keyword, state.NamedGroups[`keyword`]}, diff --git a/lexers/rst.go b/lexers/rst.go index 66ec03cdf..dfce0f860 100644 --- a/lexers/rst.go +++ b/lexers/rst.go @@ -1,6 +1,7 @@ package lexers import ( + "iter" "strings" . 
"github.com/alecthomas/chroma/v2" // nolint @@ -62,8 +63,8 @@ func restructuredtextRules() Rules { } } -func rstCodeBlock(groups []string, state *LexerState) Iterator { - iterators := []Iterator{} +func rstCodeBlock(groups []string, state *LexerState) iter.Seq[Token] { + iterators := []iter.Seq[Token]{} tokens := []Token{ {Punctuation, groups[1]}, {Text, groups[2]}, diff --git a/mutators_test.go b/mutators_test.go index ca346c24f..20d147024 100644 --- a/mutators_test.go +++ b/mutators_test.go @@ -1,6 +1,7 @@ package chroma import ( + "slices" "testing" assert "github.com/alecthomas/assert/v2" @@ -53,5 +54,5 @@ func TestCombine(t *testing.T) { it, err := l.Tokenise(nil, "hello world") assert.NoError(t, err) expected := []Token{{String, `hello`}, {Whitespace, ` `}, {Name, `world`}} - assert.Equal(t, expected, it.Tokens()) + assert.Equal(t, expected, slices.Collect(it)) } diff --git a/regexp.go b/regexp.go index c0e5e1081..f15e22d75 100644 --- a/regexp.go +++ b/regexp.go @@ -3,6 +3,7 @@ package chroma import ( "encoding/json" "fmt" + "iter" "os" "path/filepath" "regexp" @@ -40,7 +41,10 @@ func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, erro if err != nil { return nil, err } - for t := it(); t != EOF; t = it() { + for t := range it { + if t == EOF { + break + } out = append(out, t) } return out, nil @@ -178,7 +182,7 @@ type LexerState struct { NamedGroups map[string]string // Custum context for mutators. MutatorContext map[interface{}]interface{} - iteratorStack []Iterator + tokenStack [][]Token options *TokeniseOptions newlineAdded bool } @@ -193,26 +197,37 @@ func (l *LexerState) Get(key interface{}) interface{} { return l.MutatorContext[key] } -// Iterator returns the next Token from the lexer. -func (l *LexerState) Iterator() Token { // nolint: gocognit +// Iterator returns a Go iterator over tokens from the lexer. +func (l *LexerState) Iterator(yield func(Token) bool) { // nolint: gocognit trace := json.NewEncoder(os.Stderr) end := len(l.Text) if l.newlineAdded { end-- } + for l.Pos < end && len(l.Stack) > 0 { - // Exhaust the iterator stack, if any. - for len(l.iteratorStack) > 0 { - n := len(l.iteratorStack) - 1 - t := l.iteratorStack[n]() + // Exhaust the token stack, if any. + for len(l.tokenStack) > 0 { + n := len(l.tokenStack) - 1 + tokens := l.tokenStack[n] + if len(tokens) == 0 { + l.tokenStack = l.tokenStack[:n] + continue + } + // Take first token and remove it from the stack + t := tokens[0] + l.tokenStack[n] = tokens[1:] if t.Type == Ignore { continue } if t == EOF { - l.iteratorStack = l.iteratorStack[:n] + l.tokenStack = l.tokenStack[:n] continue } - return t + if !yield(t) { + return + } + continue // Check for more tokens on stack before processing rules } l.State = l.Stack[len(l.Stack)-1] @@ -256,7 +271,10 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit continue } l.Pos++ - return Token{Error, string(l.Text[l.Pos-1 : l.Pos])} + if !yield(Token{Error, string(l.Text[l.Pos-1 : l.Pos])}) { + return + } + continue } l.Rule = ruleIndex l.Groups = groups @@ -268,31 +286,49 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit } } if rule.Type != nil { - l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l)) + // Collect all tokens from the emitter and push them onto the stack + var tokens []Token + rule.Type.Emit(l.Groups, l)(func(t Token) bool { + tokens = append(tokens, t) + return true + }) + if len(tokens) > 0 { + l.tokenStack = append(l.tokenStack, tokens) + } } } - // Exhaust the IteratorStack, if any. 
- // Duplicate code, but eh. - for len(l.iteratorStack) > 0 { - n := len(l.iteratorStack) - 1 - t := l.iteratorStack[n]() + + // Exhaust the token stack, if any. + for len(l.tokenStack) > 0 { + n := len(l.tokenStack) - 1 + tokens := l.tokenStack[n] + if len(tokens) == 0 { + l.tokenStack = l.tokenStack[:n] + continue + } + // Take first token and remove it from the stack + t := tokens[0] + l.tokenStack[n] = tokens[1:] if t.Type == Ignore { continue } if t == EOF { - l.iteratorStack = l.iteratorStack[:n] + l.tokenStack = l.tokenStack[:n] continue } - return t + if !yield(t) { + return + } } // If we get to here and we still have text, return it as an error. if l.Pos != len(l.Text) && len(l.Stack) == 0 { value := string(l.Text[l.Pos:]) l.Pos = len(l.Text) - return Token{Type: Error, Value: value} + if !yield(Token{Type: Error, Value: value}) { + return + } } - return EOF } // RegexLexer is the default lexer implementation used in Chroma. @@ -456,7 +492,7 @@ func (r *RegexLexer) needRules() error { } // Tokenise text using lexer, returning an iterator. -func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { +func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) { err := r.needRules() if err != nil { return nil, err diff --git a/regexp_test.go b/regexp_test.go index 01e3a5090..1f70f77b0 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -1,6 +1,7 @@ package chroma import ( + "slices" "testing" assert "github.com/alecthomas/assert/v2" @@ -22,7 +23,7 @@ func TestNewlineAtEndOfFile(t *testing.T) { })) it, err := l.Tokenise(nil, `hello`) assert.NoError(t, err) - assert.Equal(t, []Token{{Keyword, "hello"}, {Whitespace, "\n"}}, it.Tokens()) + assert.Equal(t, []Token{{Keyword, "hello"}, {Whitespace, "\n"}}, slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -31,7 +32,7 @@ func TestNewlineAtEndOfFile(t *testing.T) { })) it, err = l.Tokenise(nil, `hello`) assert.NoError(t, err) - assert.Equal(t, []Token{{Error, "hello"}}, it.Tokens()) + assert.Equal(t, []Token{{Error, "hello"}}, slices.Collect(it)) } func TestMatchingAtStart(t *testing.T) { @@ -49,7 +50,7 @@ func TestMatchingAtStart(t *testing.T) { assert.NoError(t, err) assert.Equal(t, []Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}}, - it.Tokens()) + slices.Collect(it)) } func TestEnsureLFOption(t *testing.T) { @@ -68,7 +69,7 @@ func TestEnsureLFOption(t *testing.T) { {Whitespace, "\n"}, {Keyword, "world"}, {Whitespace, "\n"}, - }, it.Tokens()) + }, slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -85,7 +86,7 @@ func TestEnsureLFOption(t *testing.T) { {Whitespace, "\r\n"}, {Keyword, "world"}, {Whitespace, "\r"}, - }, it.Tokens()) + }, slices.Collect(it)) } func TestEnsureLFFunc(t *testing.T) { @@ -124,7 +125,7 @@ func TestByGroupNames(t *testing.T) { })) it, err := l.Tokenise(nil, `abc=123`) assert.NoError(t, err) - assert.Equal(t, []Token{{String, `abc`}, {Operator, `=`}, {String, `123`}}, it.Tokens()) + assert.Equal(t, []Token{{String, `abc`}, {Operator, `=`}, {String, `123`}}, slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -140,7 +141,7 @@ func TestByGroupNames(t *testing.T) { })) it, err = l.Tokenise(nil, `abc=123`) assert.NoError(t, err) - assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens()) + assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, 
slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -156,7 +157,7 @@ func TestByGroupNames(t *testing.T) { })) it, err = l.Tokenise(nil, `abc=123`) assert.NoError(t, err) - assert.Equal(t, []Token{{String, `abc123`}}, it.Tokens()) + assert.Equal(t, []Token{{String, `abc123`}}, slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -173,7 +174,7 @@ func TestByGroupNames(t *testing.T) { })) it, err = l.Tokenise(nil, `abc=123`) assert.NoError(t, err) - assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens()) + assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, slices.Collect(it)) l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo "root": { @@ -190,7 +191,7 @@ func TestByGroupNames(t *testing.T) { })) it, err = l.Tokenise(nil, `abc=123`) assert.NoError(t, err) - assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens()) + assert.Equal(t, []Token{{Error, `abc=123`}}, slices.Collect(it)) } func TestIgnoreToken(t *testing.T) { @@ -201,5 +202,5 @@ func TestIgnoreToken(t *testing.T) { })) it, err := l.Tokenise(nil, ` hello `) assert.NoError(t, err) - assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, it.Tokens()) + assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, slices.Collect(it)) } diff --git a/remap.go b/remap.go index bcf5e66d1..ed6dbd8f9 100644 --- a/remap.go +++ b/remap.go @@ -1,5 +1,7 @@ package chroma +import "iter" + type remappingLexer struct { lexer Lexer mapper func(Token) []Token @@ -28,24 +30,22 @@ func (r *remappingLexer) Config() *Config { return r.lexer.Config() } -func (r *remappingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { +func (r *remappingLexer) Tokenise(options *TokeniseOptions, text string) (iter.Seq[Token], error) { it, err := r.lexer.Tokenise(options, text) if err != nil { return nil, err } - var buffer []Token - return func() Token { - for { - if len(buffer) > 0 { - t := buffer[0] - buffer = buffer[1:] - return t - } - t := it() + return func(yield func(Token) bool) { + for t := range it { if t == EOF { - return t + break + } + mapped := r.mapper(t) + for _, mt := range mapped { + if !yield(mt) { + return + } } - buffer = r.mapper(t) } }, nil } diff --git a/remap_test.go b/remap_test.go index 3b76c42db..27807010c 100644 --- a/remap_test.go +++ b/remap_test.go @@ -1,6 +1,7 @@ package chroma import ( + "slices" "testing" assert "github.com/alecthomas/assert/v2" @@ -24,6 +25,6 @@ func TestRemappingLexer(t *testing.T) { {TextWhitespace, " "}, {Name, "print"}, {TextWhitespace, " "}, {Keyword, "else"}, {TextWhitespace, " "}, {Name, "end"}, } - actual := it.Tokens() + actual := slices.Collect(it) assert.Equal(t, expected, actual) } diff --git a/types.go b/types.go index 3009f9809..0ade871dd 100644 --- a/types.go +++ b/types.go @@ -1,5 +1,7 @@ package chroma +import "iter" + //go:generate enumer -text -type TokenType // TokenType is the type of token to highlight. @@ -348,7 +350,7 @@ func (t TokenType) InSubCategory(other TokenType) bool { return t/100 == other/100 } -func (t TokenType) Emit(groups []string, _ *LexerState) Iterator { +func (t TokenType) Emit(groups []string, _ *LexerState) iter.Seq[Token] { return Literator(Token{Type: t, Value: groups[0]}) }
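
The patch above replaces Chroma's hand-rolled `Iterator` (`func() Token` plus the `Tokens()` and `Stdlib()` helpers) with the standard library's `iter.Seq[Token]` throughout the lexing pipeline. The sketch below shows how a consumer drives a lexer after this change; it is a minimal illustration only — `lexers.Get("go")` and the inline source snippet are assumptions, not part of this diff — and it mirrors the defensive `chroma.EOF` check that the updated call sites keep.

```go
package main

import (
	"fmt"

	"github.com/alecthomas/chroma/v2"
	"github.com/alecthomas/chroma/v2/lexers"
)

func main() {
	// Coalesce still wraps the lexer; only the iterator type changed.
	lexer := chroma.Coalesce(lexers.Get("go"))

	// Tokenise now returns an iter.Seq[Token] rather than a func() Token.
	it, err := lexer.Tokenise(nil, "package main\n\nfunc main() {}\n")
	if err != nil {
		panic(err)
	}

	// Consume with range. The updated call sites in this diff still break
	// on chroma.EOF defensively, so the same check is kept here.
	for token := range it {
		if token == chroma.EOF {
			break
		}
		fmt.Printf("%s %q\n", token.Type, token.Value)
	}
}
```

Where the old code called `it.Tokens()`, the updated tests collect the sequence with `slices.Collect(it)` instead; like the old iterator, the returned sequence is generally single-use.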
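
Formatters move in the same direction: `FormatterFunc` now receives an `iter.Seq[chroma.Token]`. The following is a hedged sketch of a trivial custom formatter against the new signature, modelled on the `noop` formatter in `formatters/api.go` from this diff; `plainFormatter` is a made-up name for illustration and is not part of chroma.

```go
package main

import (
	"io"
	"iter"
	"os"

	"github.com/alecthomas/chroma/v2"
	"github.com/alecthomas/chroma/v2/lexers"
	"github.com/alecthomas/chroma/v2/styles"
)

// plainFormatter writes token values verbatim and ignores the style,
// following the noop formatter pattern shown in this diff.
var plainFormatter = chroma.FormatterFunc(
	func(w io.Writer, _ *chroma.Style, it iter.Seq[chroma.Token]) error {
		for t := range it {
			if t == chroma.EOF {
				break
			}
			if _, err := io.WriteString(w, t.Value); err != nil {
				return err
			}
		}
		return nil
	})

func main() {
	it, err := lexers.Get("go").Tokenise(nil, "x := 1\n")
	if err != nil {
		panic(err)
	}
	// FormatterFunc.Format still recovers from iterator panics, as before.
	if err := plainFormatter.Format(os.Stdout, styles.Fallback, it); err != nil {
		panic(err)
	}
}
```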