@@ -3,14 +3,20 @@ package unigram
3
3
import (
4
4
"encoding/json"
5
5
"fmt"
6
+ Catch "github.com/patrickmn/go-cache"
7
+ "github.com/sugarme/tokenizer"
8
+ "github.com/sugarme/tokenizer/util"
6
9
"math"
7
10
"os"
8
11
"path/filepath"
9
12
"strings"
13
+ "time"
10
14
"unicode/utf8"
15
+ )
11
16
12
- "github.com/sugarme/tokenizer"
13
- "github.com/sugarme/tokenizer/util"
17
+ const (
18
+ CacheExpiredTime = 5
19
+ CacheCleanTime = 10
14
20
)
15
21
16
22
// TokenScore represents a token and its score in the Unigram model
@@ -37,7 +43,7 @@ type Unigram struct {
37
43
bytesFallback bool
38
44
fuseUnk bool
39
45
// Cache for tokenization
40
- cache map [ string ][] string
46
+ cache * Catch. Cache
41
47
}
42
48
43
49
// UnigramBuilder can be used to create a Unigram model with a custom configuration
@@ -103,7 +109,7 @@ func (ub *UnigramBuilder) Build() (*Unigram, error) {
103
109
unkID : ub .config .unkID ,
104
110
bytesFallback : ub .config .bytesFallback ,
105
111
fuseUnk : ub .config .fuseUnk ,
106
- cache : make ( map [ string ][] string ),
112
+ cache : Catch . New ( CacheExpiredTime * time . Minute , CacheCleanTime * time . Minute ),
107
113
}, nil
108
114
}
109
115
@@ -239,14 +245,17 @@ func (u *Unigram) Save(dir string, prefixOpt ...string) error {
239
245
// Tokenize tokenizes the given sequence into multiple tokens
240
246
func (u * Unigram ) Tokenize (sequence string ) ([]tokenizer.Token , error ) {
241
247
// Check cache first
242
- if tokens , ok := u .cache [sequence ]; ok {
248
+ data , ok := u .cache .Get (sequence )
249
+ if ok {
250
+ tokens := data .([]string )
243
251
return u .tokensToTokenizer (tokens , sequence ), nil
244
252
}
245
253
246
254
// If byte fallback is enabled, always use it
247
255
if u .bytesFallback {
248
256
tokens := u .tokenizeWithByteFallback (sequence )
249
- u .cache [sequence ] = tokens
257
+ u .cache .Set (sequence , tokens , CacheExpiredTime * time .Minute )
258
+
250
259
return u .tokensToTokenizer (tokens , sequence ), nil
251
260
}
252
261
@@ -255,9 +264,7 @@ func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error) {
255
264
if err != nil {
256
265
return nil , err
257
266
}
258
-
259
- // Cache the result
260
- u .cache [sequence ] = tokens
267
+ u .cache .Set (sequence , tokens , CacheExpiredTime * time .Minute )
261
268
262
269
return u .tokensToTokenizer (tokens , sequence ), nil
263
270
}
0 commit comments