
Commit 27c0854

Merge pull request #70 from gengzongjie/master
Fix #69
2 parents d78f65a + aeeacb4

File tree: 13 files changed (+58 additions, -40 deletions)


.gitignore

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 .directory
 *.swp
 *.swo
-
+.idea
 input/
 data/
 example/testdata/

example/unigram/main.go

Lines changed: 2 additions & 2 deletions (whitespace only)
@@ -42,13 +42,13 @@ func main() {

 func testTokenization(model tokenizer.Model, text string) {
 	fmt.Printf("Tokenizing: %q\n", text)
-
+
 	tokens, err := model.Tokenize(text)
 	if err != nil {
 		fmt.Printf("Error tokenizing: %v\n", err)
 		return
 	}
-
+
 	fmt.Printf("Tokens (%d):\n", len(tokens))
 	for i, token := range tokens {
 		fmt.Printf(" %d. ID=%d, Value=%q, Offsets=%v\n", i, token.Id, token.Value, token.Offsets)

go.mod

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 module github.com/sugarme/tokenizer

-go 1.23
+go 1.23.0
+
+toolchain go1.24.4

 require (
 	github.com/emirpasic/gods v1.18.1
+	github.com/patrickmn/go-cache v2.1.0+incompatible
 	github.com/rivo/uniseg v0.4.7
 	github.com/schollz/progressbar/v2 v2.15.0
 	github.com/sugarme/regexpset v0.0.0-20200920021344-4d4ec8eaf93c
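
A note on the new requirement (editorial aside, not from the commit): patrickmn/go-cache tags its releases as v2.x but predates Go modules and declares no /v2 module path, so the toolchain records it with the +incompatible suffix. Under a standard module workflow it would have been added with something like:

	go get github.com/patrickmn/go-cache@v2.1.0+incompatible

which produces the require line above and the two checksum lines in go.sum below.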

go.sum

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=

model/bpe/trainer.go

Lines changed: 7 additions & 5 deletions
@@ -85,7 +85,7 @@ func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken) {
 	btb.Config.SpecialTokens = tokens
 }

-//LimitAlphabet set the alphabet limit
+// LimitAlphabet set the alphabet limit
 func (btb *BpeTrainerBuilder) LimitAlphabet(limit int) {
 	btb.Config.LimitAlphabet = &limit
 }
@@ -123,10 +123,12 @@ func (btb *BpeTrainerBuilder) Build() *BpeTrainer {
 // mapping of words to word counts.
 //
 // Example:
-// wordCounts := map[string]int = {
-//  {"Hello", 1},
-//  {"World", 1},
-// }
+//
+//	wordCounts := map[string]int = {
+//	 {"Hello", 1},
+//	 {"World", 1},
+//	}
+//
 // trainer := NewBPETrainer()
 // model, specialTokens := trainer.Train(wordCounts)
 type BpeTrainer struct {
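
The doc-comment hunk adopts the Go 1.19+ convention where a run of tab-indented `//` lines renders as a code block in godoc. The example itself still isn't valid Go, though: `wordCounts := map[string]int = {...}` mixes `:=` with `=` and writes map entries as brace pairs rather than key: value. A corrected sketch of what the comment appears to intend (the NewBPETrainer call is copied verbatim from the comment; the package's real constructor may differ):

	wordCounts := map[string]int{
		"Hello": 1,
		"World": 1,
	}
	trainer := NewBPETrainer()
	model, specialTokens := trainer.Train(wordCounts)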

model/unigram/unigram.go

Lines changed: 16 additions & 9 deletions
@@ -3,14 +3,20 @@ package unigram
 import (
 	"encoding/json"
 	"fmt"
+	Catch "github.com/patrickmn/go-cache"
+	"github.com/sugarme/tokenizer"
+	"github.com/sugarme/tokenizer/util"
 	"math"
 	"os"
 	"path/filepath"
 	"strings"
+	"time"
 	"unicode/utf8"
+)

-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+const (
+	CacheExpiredTime = 5
+	CacheCleanTime   = 10
 )

 // TokenScore represents a token and its score in the Unigram model
@@ -37,7 +43,7 @@ type Unigram struct {
 	bytesFallback bool
 	fuseUnk       bool
 	// Cache for tokenization
-	cache map[string][]string
+	cache *Catch.Cache
 }

 // UnigramBuilder can be used to create a Unigram model with a custom configuration
@@ -103,7 +109,7 @@ func (ub *UnigramBuilder) Build() (*Unigram, error) {
 		unkID:         ub.config.unkID,
 		bytesFallback: ub.config.bytesFallback,
 		fuseUnk:       ub.config.fuseUnk,
-		cache:         make(map[string][]string),
+		cache:         Catch.New(CacheExpiredTime*time.Minute, CacheCleanTime*time.Minute),
 	}, nil
 }

@@ -239,14 +245,17 @@ func (u *Unigram) Save(dir string, prefixOpt ...string) error {
 // Tokenize tokenizes the given sequence into multiple tokens
 func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error) {
 	// Check cache first
-	if tokens, ok := u.cache[sequence]; ok {
+	data, ok := u.cache.Get(sequence)
+	if ok {
+		tokens := data.([]string)
 		return u.tokensToTokenizer(tokens, sequence), nil
 	}

 	// If byte fallback is enabled, always use it
 	if u.bytesFallback {
 		tokens := u.tokenizeWithByteFallback(sequence)
-		u.cache[sequence] = tokens
+		u.cache.Set(sequence, tokens, CacheExpiredTime*time.Minute)
+
 		return u.tokensToTokenizer(tokens, sequence), nil
 	}

@@ -255,9 +264,7 @@ func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error) {
 	if err != nil {
 		return nil, err
 	}
-
-	// Cache the result
-	u.cache[sequence] = tokens
+	u.cache.Set(sequence, tokens, CacheExpiredTime)

 	return u.tokensToTokenizer(tokens, sequence), nil
 }
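
Two editorial observations on this file. First, the tokenization cache moves from an unbounded map[string][]string, which never evicts and is not safe for concurrent use, to patrickmn/go-cache, which provides a mutex-guarded store with TTL eviction (the import alias Catch is presumably a typo for Cache, but it is used consistently). Second, the final hunk calls u.cache.Set(sequence, tokens, CacheExpiredTime) without the *time.Minute scaling used on the byte-fallback path; CacheExpiredTime is the untyped constant 5, which converts to a time.Duration of 5 nanoseconds, so entries stored on that path expire almost immediately. Presumably CacheExpiredTime*time.Minute was intended there as well. A minimal standalone sketch of the go-cache pattern the code adopts:

	package main

	import (
		"fmt"
		"time"

		gocache "github.com/patrickmn/go-cache"
	)

	func main() {
		// Mirrors CacheExpiredTime/CacheCleanTime above: entries live for
		// 5 minutes; the background janitor purges expired ones every 10.
		c := gocache.New(5*time.Minute, 10*time.Minute)

		// go-cache stores interface{} values, hence the type assertion on read.
		c.Set("Hello World", []string{"▁Hello", "▁World"}, 5*time.Minute)

		if data, ok := c.Get("Hello World"); ok {
			tokens := data.([]string)
			fmt.Println(tokens) // [▁Hello ▁World]
		}
	}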

model/unigram/unigram_test.go

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 package unigram

 import (
-	"testing"
 	"reflect"
+	"testing"

 	"github.com/sugarme/tokenizer/util"
 )

normalizer/normalized.go

Lines changed: 5 additions & 1 deletion
@@ -590,7 +590,11 @@ func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeM
 			align = n.alignments[idx-1]
 		}
 	} else {
-		align = n.alignments[idx]
+		if idx >= len(n.alignments) {
+			align = n.alignments[len(n.alignments)-1]
+		} else {
+			align = n.alignments[idx]
+		}
 	}

 	// If we are replacing a character, find it and compute the change in size
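
This hunk reads like the substantive fix for #69: TransformRange could index n.alignments one past its end during a range transform and panic with an index out of range; the new branch clamps to the last alignment instead of indexing blindly. The guard in isolation, as a sketch over plain slices rather than the library's alignment type:

	// clamp limits i to the valid indices of a slice of length n (n > 0),
	// falling back to the last element when i overruns the end.
	func clamp(i, n int) int {
		if i >= n {
			return n - 1
		}
		return i
	}

	// e.g. align := alignments[clamp(idx, len(alignments))]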

pretokenizer/metaspace.go

Lines changed: 2 additions & 2 deletions (whitespace only)
@@ -36,7 +36,7 @@ func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace {
 	if addPrefixSpace {
 		scheme = Always
 	}
-
+
 	return &Metaspace{
 		Replacement:   replacement,
 		PrependScheme: scheme,
@@ -49,7 +49,7 @@ func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace {
 func NewMetaspaceWithScheme(replacement string, scheme PrependScheme) *Metaspace {
 	// Set AddPrefixSpace for backward compatibility
 	addPrefixSpace := scheme != Never
-
+
 	return &Metaspace{
 		Replacement:   replacement,
 		PrependScheme: scheme,

pretrained/gpt2.go

Lines changed: 4 additions & 4 deletions (doc-comment indentation only)
@@ -15,10 +15,10 @@ import (
 // GPT2 loads GPT2 (small) tokenizer from vocab and merges files.
 //
 // Params:
-// - addPrefixSpace: set whether to add a leading space to the first word.
-// It allows to treat the leading word just as any other words.
-// - trimOffsets: set Whether the post processing step should trim offsets
-// to avoid including whitespaces.
+//   - addPrefixSpace: set whether to add a leading space to the first word.
+//     It allows to treat the leading word just as any other words.
+//   - trimOffsets: set Whether the post processing step should trim offsets
+//     to avoid including whitespaces.
 //
 // Special tokens:
 //   - cls-token: "<s>"
