Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ linters:
- recvcheck
- tenv
- err113
- prealloc

linters-settings:
gocyclo:
Expand All @@ -62,7 +63,6 @@ linters-settings:
# - (Must)?NewLexer$
exclude_godoc_examples: false


issues:
exclude-dirs:
- _examples
Expand All @@ -71,19 +71,19 @@ issues:
exclude-use-default: false
exclude:
# Captured by errcheck.
- '^(G104|G204):'
- "^(G104|G204):"
# Very commonly not checked.
- 'Error return value of .(.*\.Help|.*\.MarkFlagRequired|(os\.)?std(out|err)\..*|.*Close|.*Flush|os\.Remove(All)?|.*printf?|os\.(Un)?Setenv). is not checked'
- 'exported method (.*\.MarshalJSON|.*\.UnmarshalJSON|.*\.EntityURN|.*\.GoString|.*\.Pos) should have comment or be unexported'
- 'composite literal uses unkeyed fields'
- "composite literal uses unkeyed fields"
- 'declaration of "err" shadows declaration'
- 'should not use dot imports'
- 'Potential file inclusion via variable'
- 'should have comment or be unexported'
- 'comment on exported var .* should be of the form'
- 'at least one file in a package should have a package comment'
- 'string literal contains the Unicode'
- 'methods on the same type should have the same receiver name'
- '_TokenType_name should be _TokenTypeName'
- '`_TokenType_map` should be `_TokenTypeMap`'
- 'rewrite if-else to switch statement'
- "should not use dot imports"
- "Potential file inclusion via variable"
- "should have comment or be unexported"
- "comment on exported var .* should be of the form"
- "at least one file in a package should have a package comment"
- "string literal contains the Unicode"
- "methods on the same type should have the same receiver name"
- "_TokenType_name should be _TokenTypeName"
- "`_TokenType_map` should be `_TokenTypeMap`"
- "rewrite if-else to switch statement"
2 changes: 1 addition & 1 deletion cmd/chroma/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/alecthomas/chroma/v2/cmd/chroma

go 1.22
go 1.23

toolchain go1.25.1

Expand Down
5 changes: 4 additions & 1 deletion cmd/chroma/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,10 @@ func format(ctx *kong.Context, w io.Writer, style *chroma.Style, it chroma.Itera

func check(filename string, it chroma.Iterator) {
line, col := 1, 0
for token := it(); token != chroma.EOF; token = it() {
for token := range it {
if token == chroma.EOF {
break
}
if token.Type == chroma.Error {
fmt.Printf("%s:%d:%d %q\n", filename, line, col, token.String())
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/chromad/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/alecthomas/chroma/v2/cmd/chromad

go 1.22
go 1.23

toolchain go1.25.1

Expand Down
20 changes: 12 additions & 8 deletions coalesce.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ func Coalesce(lexer Lexer) Lexer { return &coalescer{lexer} }
type coalescer struct{ Lexer }

func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
var prev Token
it, err := d.Lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
return func() Token {
for token := it(); token != (EOF); token = it() {
return func(yield func(Token) bool) {
var prev Token
for token := range it {
if token == EOF {
break
}
if len(token.Value) == 0 {
continue
}
Expand All @@ -22,14 +25,15 @@ func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, e
if prev.Type == token.Type && len(prev.Value) < 8192 {
prev.Value += token.Value
} else {
out := prev
if !yield(prev) {
return
}
prev = token
return out
}
}
}
out := prev
prev = EOF
return out
if prev != EOF {
yield(prev)
}
}, nil
}
5 changes: 4 additions & 1 deletion formatters/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ import (
var (
// NoOp formatter.
NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, iterator chroma.Iterator) error {
for t := iterator(); t != chroma.EOF; t = iterator() {
for t := range iterator {
if t == chroma.EOF {
break
}
if _, err := io.WriteString(w, t.Value); err != nil {
return err
}
Expand Down
5 changes: 4 additions & 1 deletion formatters/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ var JSON = Register("json", chroma.FormatterFunc(func(w io.Writer, s *chroma.Sty
return err
}
i := 0
for t := it(); t != chroma.EOF; t = it() {
for t := range it {
if t == chroma.EOF {
break
}
if i > 0 {
if _, err := fmt.Fprintln(w, ","); err != nil {
return err
Expand Down
5 changes: 4 additions & 1 deletion formatters/tokens.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ import (

// Tokens formatter outputs the raw token structures.
var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error {
for t := it(); t != chroma.EOF; t = it() {
for t := range it {
if t == chroma.EOF {
break
}
if _, err := fmt.Fprintln(w, t.GoString()); err != nil {
return err
}
Expand Down
5 changes: 4 additions & 1 deletion formatters/tty_indexed.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,10 @@ type indexedTTYFormatter struct {

func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style, it chroma.Iterator) (err error) {
theme := styleToEscapeSequence(c.table, style)
for token := it(); token != chroma.EOF; token = it() {
for token := range it {
if token == chroma.EOF {
break
}
clr, ok := theme[token.Type]

// This search mimics how styles.Get() is used in tty_truecolour.go.
Expand Down
5 changes: 4 additions & 1 deletion formatters/tty_truecolour.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ func writeToken(w io.Writer, formatting string, text string) {

func trueColourFormatter(w io.Writer, style *chroma.Style, it chroma.Iterator) error {
style = clearBackground(style)
for token := it(); token != chroma.EOF; token = it() {
for token := range it {
if token == chroma.EOF {
break
}
entry := style.Get(token.Type)
if entry.IsZero() {
fmt.Fprint(w, token.Value)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/alecthomas/chroma/v2

go 1.22
go 1.23

require (
github.com/alecthomas/assert/v2 v2.11.0
Expand Down
50 changes: 23 additions & 27 deletions iterator.go
Original file line number Diff line number Diff line change
@@ -1,57 +1,53 @@
package chroma

import "strings"
import (
"iter"
"strings"
)

// An Iterator across tokens.
//
// EOF will be returned at the end of the Token stream.
//
// If an error occurs within an Iterator, it may propagate this in a panic. Formatters should recover.
type Iterator func() Token
type Iterator iter.Seq[Token]

// Tokens consumes all tokens from the iterator and returns them as a slice.
func (i Iterator) Tokens() []Token {
var out []Token
for t := i(); t != EOF; t = i() {
for t := range i {
if t == EOF {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I don't see that this would be necessary: in fact I'd strongly suggest it's not necessary to define an EOF token at all. Then this function can be as simple as:

func (i Iterator) Tokens() []Token {
    return slices.Collect(i)
}

which to my mind somewhat calls into question whether the Tokens method is worth having at all.
Maybe Iterator could be defined just as an alias:

type Iterator = iter.Seq[Token]

But then again, it's quite possibly worth preserving surface API compatibility even if the underlying representation changes.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This thought had crossed my mind and I don't exactly recall why there's an EOF token TBH, but there was a good reason for it at some point, so I left it in.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW the EOF token definitely seems a bit "surprising" to me. An analogy that springs to mind is that it feels a little like passing around length-delimited strings but still keeping a zero-byte at the end and asking everyone to ignore the last character.

I've not seen an example of this pattern before and I personally would think very carefully through (and explicitly document) the reasons for why it needs to be this way, assuming it really does.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It existed before to signify that the stream had reached EOF, which is obviously redundant with Go iterators.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I was aware of its previous use/need, but wondered if there was a reason you'd left it around when moving to the new iterator API.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see. No, this is basically a half-hour PoC, not ready for merge. I was mostly pondering whether switching to iterators would be worth a major version bump. But in combination with some other cleanup, like eradicating EOF, it could be worthwhile I think.

break
}
out = append(out, t)
}
return out
}

// Stdlib converts a Chroma iterator to a Go 1.23-compatible iterator.
func (i Iterator) Stdlib() func(yield func(Token) bool) {
return func(yield func(Token) bool) {
for t := i(); t != EOF; t = i() {
if !yield(t) {
return
}
}
}
}

// Concaterator concatenates tokens from a series of iterators.
func Concaterator(iterators ...Iterator) Iterator {
return func() Token {
for len(iterators) > 0 {
t := iterators[0]()
if t != EOF {
return t
return func(yield func(Token) bool) {
for _, it := range iterators {
for t := range it {
if t == EOF {
break
}
if !yield(t) {
return
}
}
iterators = iterators[1:]
}
return EOF
}
}

// Literator converts a sequence of literal Tokens into an Iterator.
func Literator(tokens ...Token) Iterator {
return func() Token {
if len(tokens) == 0 {
return EOF
return func(yield func(Token) bool) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return slices.Values(tokens)

for _, token := range tokens {
if !yield(token) {
return
}
}
token := tokens[0]
tokens = tokens[1:]
return token
}
}

Expand Down
94 changes: 51 additions & 43 deletions lexers/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,63 +69,71 @@ func httpBodyContentTypeLexer(lexer Lexer) Lexer { return &httpBodyContentTyper{
type httpBodyContentTyper struct{ Lexer }

func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint: gocognit
var contentType string
var isContentType bool
var subIterator Iterator

it, err := d.Lexer.Tokenise(options, text)
if err != nil {
return nil, err
}

return func() Token {
token := it()
return func(yield func(Token) bool) {
var contentType string
var isContentType bool
var subIterator Iterator

if token == EOF {
if subIterator != nil {
return subIterator()
for token := range it {
if token == EOF {
break
}
return EOF
}

switch {
case token.Type == Name && strings.ToLower(token.Value) == "content-type":
{
isContentType = true
}
case token.Type == Literal && isContentType:
{
isContentType = false
contentType = strings.TrimSpace(token.Value)
pos := strings.Index(contentType, ";")
if pos > 0 {
contentType = strings.TrimSpace(contentType[:pos])
switch {
case token.Type == Name && strings.ToLower(token.Value) == "content-type":
{
isContentType = true
}
}
case token.Type == Generic && contentType != "":
{
lexer := MatchMimeType(contentType)

// application/calendar+xml can be treated as application/xml
// if there's not a better match.
if lexer == nil && strings.Contains(contentType, "+") {
slashPos := strings.Index(contentType, "/")
plusPos := strings.LastIndex(contentType, "+")
contentType = contentType[:slashPos+1] + contentType[plusPos+1:]
lexer = MatchMimeType(contentType)
case token.Type == Literal && isContentType:
{
isContentType = false
contentType = strings.TrimSpace(token.Value)
pos := strings.Index(contentType, ";")
if pos > 0 {
contentType = strings.TrimSpace(contentType[:pos])
}
}
case token.Type == Generic && contentType != "":
{
lexer := MatchMimeType(contentType)

// application/calendar+xml can be treated as application/xml
// if there's not a better match.
if lexer == nil && strings.Contains(contentType, "+") {
slashPos := strings.Index(contentType, "/")
plusPos := strings.LastIndex(contentType, "+")
contentType = contentType[:slashPos+1] + contentType[plusPos+1:]
lexer = MatchMimeType(contentType)
}

if lexer == nil {
token.Type = Text
} else {
subIterator, err = lexer.Tokenise(nil, token.Value)
if err != nil {
panic(err)
if lexer == nil {
token.Type = Text
} else {
subIterator, err = lexer.Tokenise(nil, token.Value)
if err != nil {
panic(err)
}
// Emit tokens from the sub-iterator
for st := range subIterator {
if st == EOF {
break
}
if !yield(st) {
return
}
}
continue
}
return EOF
}
}
if !yield(token) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given this is at the end of the function, I think this test is redundant.

return
}
}
return token
}, nil
}
5 changes: 4 additions & 1 deletion lexers/lexer_benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,10 @@ func Benchmark(b *testing.B) {
for range b.N {
it, err := lexers.GlobalLexerRegistry.Get("Java").Tokenise(nil, lexerBenchSource)
assert.NoError(b, err)
for t := it(); t != chroma.EOF; t = it() {
for t := range it {
if t == chroma.EOF {
break
}
}
}
}
Loading