
Commit 27c0854

Merge pull request #70 from gengzongjie/master
Fix #69
2 parents d78f65a + aeeacb4

File tree: 13 files changed (+58 additions, -40 deletions)


.gitignore

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 .directory
 *.swp
 *.swo
-
+.idea
 input/
 data/
 example/testdata/

example/unigram/main.go

Lines changed: 2 additions & 2 deletions (whitespace only)
@@ -42,13 +42,13 @@ func main() {

 func testTokenization(model tokenizer.Model, text string) {
 	fmt.Printf("Tokenizing: %q\n", text)
-
+
 	tokens, err := model.Tokenize(text)
 	if err != nil {
 		fmt.Printf("Error tokenizing: %v\n", err)
 		return
 	}
-
+
 	fmt.Printf("Tokens (%d):\n", len(tokens))
 	for i, token := range tokens {
 		fmt.Printf(" %d. ID=%d, Value=%q, Offsets=%v\n", i, token.Id, token.Value, token.Offsets)

go.mod

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 module github.com/sugarme/tokenizer

-go 1.23
+go 1.23.0
+
+toolchain go1.24.4

 require (
 	github.com/emirpasic/gods v1.18.1
+	github.com/patrickmn/go-cache v2.1.0+incompatible
 	github.com/rivo/uniseg v0.4.7
 	github.com/schollz/progressbar/v2 v2.15.0
 	github.com/sugarme/regexpset v0.0.0-20200920021344-4d4ec8eaf93c
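
A note on the new requirement (editorial aside, not from the commit): patrickmn/go-cache tags its releases as v2.x but predates Go modules and declares no /v2 module path, so the toolchain records it with the +incompatible suffix. Under a standard module workflow it would have been added with something like:

	go get github.com/patrickmn/go-cache@v2.1.0+incompatible

which produces the require line above and the two checksum lines in go.sum below.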

go.sum

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=

model/bpe/trainer.go

Lines changed: 7 additions & 5 deletions
@@ -85,7 +85,7 @@ func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken) {
 	btb.Config.SpecialTokens = tokens
 }

-//LimitAlphabet set the alphabet limit
+// LimitAlphabet set the alphabet limit
 func (btb *BpeTrainerBuilder) LimitAlphabet(limit int) {
 	btb.Config.LimitAlphabet = &limit
 }
@@ -123,10 +123,12 @@ func (btb *BpeTrainerBuilder) Build() *BpeTrainer {
 // mapping of words to word counts.
 //
 // Example:
-// wordCounts := map[string]int = {
-//  {"Hello", 1},
-//  {"World", 1},
-// }
+//
+//	wordCounts := map[string]int = {
+//	 {"Hello", 1},
+//	 {"World", 1},
+//	}
+//
 // trainer := NewBPETrainer()
 // model, specialTokens := trainer.Train(wordCounts)
 type BpeTrainer struct {
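
The doc-comment hunk adopts the Go 1.19+ convention where a run of tab-indented `//` lines renders as a code block in godoc. The example itself still isn't valid Go, though: `wordCounts := map[string]int = {...}` mixes `:=` with `=` and writes map entries as brace pairs rather than key: value. A corrected sketch of what the comment appears to intend (the NewBPETrainer call is copied verbatim from the comment; the package's real constructor may differ):

	wordCounts := map[string]int{
		"Hello": 1,
		"World": 1,
	}
	trainer := NewBPETrainer()
	model, specialTokens := trainer.Train(wordCounts)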

model/unigram/unigram.go

Lines changed: 16 additions & 9 deletions
@@ -3,14 +3,20 @@ package unigram
 import (
 	"encoding/json"
 	"fmt"
+	Catch "github.com/patrickmn/go-cache"
+	"github.com/sugarme/tokenizer"
+	"github.com/sugarme/tokenizer/util"
 	"math"
 	"os"
 	"path/filepath"
 	"strings"
+	"time"
 	"unicode/utf8"
+)

-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+const (
+	CacheExpiredTime = 5
+	CacheCleanTime   = 10
 )

 // TokenScore represents a token and its score in the Unigram model
@@ -37,7 +43,7 @@ type Unigram struct {
 	bytesFallback bool
 	fuseUnk       bool
 	// Cache for tokenization
-	cache map[string][]string
+	cache *Catch.Cache
 }

 // UnigramBuilder can be used to create a Unigram model with a custom configuration
@@ -103,7 +109,7 @@ func (ub *UnigramBuilder) Build() (*Unigram, error) {
 		unkID:         ub.config.unkID,
 		bytesFallback: ub.config.bytesFallback,
 		fuseUnk:       ub.config.fuseUnk,
-		cache:         make(map[string][]string),
+		cache:         Catch.New(CacheExpiredTime*time.Minute, CacheCleanTime*time.Minute),
 	}, nil
 }

@@ -239,14 +245,17 @@ func (u *Unigram) Save(dir string, prefixOpt ...string) error {
 // Tokenize tokenizes the given sequence into multiple tokens
 func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error) {
 	// Check cache first
-	if tokens, ok := u.cache[sequence]; ok {
+	data, ok := u.cache.Get(sequence)
+	if ok {
+		tokens := data.([]string)
 		return u.tokensToTokenizer(tokens, sequence), nil
 	}

 	// If byte fallback is enabled, always use it
 	if u.bytesFallback {
 		tokens := u.tokenizeWithByteFallback(sequence)
-		u.cache[sequence] = tokens
+		u.cache.Set(sequence, tokens, CacheExpiredTime*time.Minute)
+
 		return u.tokensToTokenizer(tokens, sequence), nil
 	}

@@ -255,9 +264,7 @@ func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error) {
 	if err != nil {
 		return nil, err
 	}
-
-	// Cache the result
-	u.cache[sequence] = tokens
+	u.cache.Set(sequence, tokens, CacheExpiredTime)

 	return u.tokensToTokenizer(tokens, sequence), nil
 }
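
Two editorial observations on this file. First, the tokenization cache moves from an unbounded map[string][]string, which never evicts and is not safe for concurrent use, to patrickmn/go-cache, which provides a mutex-guarded store with TTL eviction (the import alias Catch is presumably a typo for Cache, but it is used consistently). Second, the final hunk calls u.cache.Set(sequence, tokens, CacheExpiredTime) without the *time.Minute scaling used on the byte-fallback path; CacheExpiredTime is the untyped constant 5, which converts to a time.Duration of 5 nanoseconds, so entries stored on that path expire almost immediately. Presumably CacheExpiredTime*time.Minute was intended there as well. A minimal standalone sketch of the go-cache pattern the code adopts:

	package main

	import (
		"fmt"
		"time"

		gocache "github.com/patrickmn/go-cache"
	)

	func main() {
		// Mirrors CacheExpiredTime/CacheCleanTime above: entries live for
		// 5 minutes; the background janitor purges expired ones every 10.
		c := gocache.New(5*time.Minute, 10*time.Minute)

		// go-cache stores interface{} values, hence the type assertion on read.
		c.Set("Hello World", []string{"▁Hello", "▁World"}, 5*time.Minute)

		if data, ok := c.Get("Hello World"); ok {
			tokens := data.([]string)
			fmt.Println(tokens) // [▁Hello ▁World]
		}
	}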

model/unigram/unigram_test.go

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 package unigram

 import (
-	"testing"
 	"reflect"
+	"testing"

 	"github.com/sugarme/tokenizer/util"
 )

normalizer/normalized.go

Lines changed: 5 additions & 1 deletion
@@ -590,7 +590,11 @@ func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeM
 			align = n.alignments[idx-1]
 		}
 	} else {
-		align = n.alignments[idx]
+		if idx >= len(n.alignments) {
+			align = n.alignments[len(n.alignments)-1]
+		} else {
+			align = n.alignments[idx]
+		}
 	}

 	// If we are replacing a character, find it and compute the change in size
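
This hunk reads like the substantive fix for #69: TransformRange could index n.alignments one past its end during a range transform and panic with an index out of range; the new branch clamps to the last alignment instead of indexing blindly. The guard in isolation, as a sketch over plain slices rather than the library's alignment type:

	// clamp limits i to the valid indices of a slice of length n (n > 0),
	// falling back to the last element when i overruns the end.
	func clamp(i, n int) int {
		if i >= n {
			return n - 1
		}
		return i
	}

	// e.g. align := alignments[clamp(idx, len(alignments))]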

pretokenizer/metaspace.go

Lines changed: 2 additions & 2 deletions (whitespace only)
@@ -36,7 +36,7 @@ func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace {
 	if addPrefixSpace {
 		scheme = Always
 	}
-
+
 	return &Metaspace{
 		Replacement:   replacement,
 		PrependScheme: scheme,
@@ -49,7 +49,7 @@ func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace {
 func NewMetaspaceWithScheme(replacement string, scheme PrependScheme) *Metaspace {
 	// Set AddPrefixSpace for backward compatibility
 	addPrefixSpace := scheme != Never
-
+
 	return &Metaspace{
 		Replacement:   replacement,
 		PrependScheme: scheme,

pretrained/gpt2.go

Lines changed: 4 additions & 4 deletions (doc-comment indentation only)
@@ -15,10 +15,10 @@ import (
 // GPT2 loads GPT2 (small) tokenizer from vocab and merges files.
 //
 // Params:
-// - addPrefixSpace: set whether to add a leading space to the first word.
-// It allows to treat the leading word just as any other words.
-// - trimOffsets: set Whether the post processing step should trim offsets
-// to avoid including whitespaces.
+//   - addPrefixSpace: set whether to add a leading space to the first word.
+//     It allows to treat the leading word just as any other words.
+//   - trimOffsets: set Whether the post processing step should trim offsets
+//     to avoid including whitespaces.
 //
 // Special tokens:
 //   - cls-token: "<s>"
