Skip to content

Commit aa4cba7

Browse files
committed
update chardet
1 parent faeab6d commit aa4cba7

File tree

3 files changed

+234
-0
lines changed

3 files changed

+234
-0
lines changed

chardet/chardet.charamel.go

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
//go:build charamel
2+
3+
package chardet
4+
5+
import (
6+
"errors"
7+
8+
"github.com/gonejack/charamel"
9+
)
10+
11+
func init() {
12+
prefer(DetectEncodingByCharamel)
13+
}
14+
15+
var encodings = []charamel.Encoding{
16+
// UTF编码
17+
charamel.UTF8,
18+
charamel.UTF16,
19+
charamel.UTF16BE,
20+
charamel.UTF16LE,
21+
charamel.UTF32,
22+
charamel.UTF32BE,
23+
charamel.UTF32LE,
24+
25+
// ASCII
26+
charamel.ASCII,
27+
28+
// 西欧编码 (ISO-8859)
29+
charamel.LATIN1, // ISO-8859-1
30+
charamel.ISO88592, // ISO-8859-2
31+
charamel.ISO88593, // ISO-8859-3
32+
charamel.ISO88594, // ISO-8859-4
33+
charamel.ISO88595, // ISO-8859-5
34+
charamel.ISO88596, // ISO-8859-6
35+
charamel.ISO88597, // ISO-8859-7
36+
charamel.ISO88598, // ISO-8859-8
37+
charamel.ISO88599, // ISO-8859-9
38+
charamel.ISO885910, // ISO-8859-10
39+
charamel.ISO885911, // ISO-8859-11
40+
charamel.ISO885913, // ISO-8859-13
41+
charamel.ISO885914, // ISO-8859-14
42+
charamel.ISO885915, // ISO-8859-15
43+
charamel.ISO885916, // ISO-8859-16
44+
45+
// Windows编码 (CP1250-1258)
46+
charamel.CP1250,
47+
charamel.CP1251,
48+
charamel.CP1252,
49+
charamel.CP1253,
50+
charamel.CP1254,
51+
charamel.CP1255,
52+
charamel.CP1256,
53+
charamel.CP1257,
54+
charamel.CP1258,
55+
56+
// 中文编码
57+
charamel.GB2312,
58+
charamel.GBK,
59+
charamel.GB18030,
60+
charamel.BIG5,
61+
charamel.BIG5HKSCS,
62+
charamel.HZ, // hz-gb-2312
63+
64+
// 日文编码
65+
charamel.EUCJP, // euc-jp
66+
charamel.SHIFTJIS, // shift_jis
67+
charamel.ISO2022JP, // iso-2022-jp
68+
69+
// 韩文编码
70+
charamel.EUCKR, // euc-kr
71+
charamel.CP949, // windows-949
72+
charamel.ISO2022KR, // iso-2022-kr
73+
74+
// 俄文编码
75+
charamel.KOI8R, // koi8-r
76+
charamel.KOI8U, // koi8-u
77+
charamel.CP866, // cp866
78+
79+
// 泰文编码
80+
charamel.TIS620, // tis-620
81+
charamel.CP874, // windows-874
82+
83+
// Mac编码
84+
charamel.MACROMAN, // macintosh
85+
charamel.MACCYRILLIC, // x-mac-cyrillic
86+
}
87+
88+
func DetectEncodingByCharamel(dat []byte) (string, error) {
89+
d, err := charamel.NewDetector(encodings, 0)
90+
if err != nil {
91+
return "", err
92+
}
93+
v := d.Detect(dat)
94+
if v == nil {
95+
return "", errors.New("detect failed by github.com/gonejack/charamel")
96+
}
97+
return mapName(v), nil
98+
}
99+
100+
func mapName(encoding *charamel.Encoding) string {
101+
switch *encoding {
102+
// UTF编码
103+
case charamel.UTF8:
104+
return "utf-8"
105+
case charamel.UTF16:
106+
return "utf-16"
107+
case charamel.UTF16BE:
108+
return "utf-16be"
109+
case charamel.UTF16LE:
110+
return "utf-16le"
111+
case charamel.UTF32:
112+
return "utf-32"
113+
case charamel.UTF32BE:
114+
return "utf-32be"
115+
case charamel.UTF32LE:
116+
return "utf-32le"
117+
118+
// ASCII
119+
case charamel.ASCII:
120+
return "ascii"
121+
122+
// 西欧编码 (ISO-8859)
123+
case charamel.LATIN1:
124+
return "iso-8859-1"
125+
case charamel.ISO88592:
126+
return "iso-8859-2"
127+
case charamel.ISO88593:
128+
return "iso-8859-3"
129+
case charamel.ISO88594:
130+
return "iso-8859-4"
131+
case charamel.ISO88595:
132+
return "iso-8859-5"
133+
case charamel.ISO88596:
134+
return "iso-8859-6"
135+
case charamel.ISO88597:
136+
return "iso-8859-7"
137+
case charamel.ISO88598:
138+
return "iso-8859-8"
139+
case charamel.ISO88599:
140+
return "iso-8859-9"
141+
case charamel.ISO885910:
142+
return "iso-8859-10"
143+
case charamel.ISO885911:
144+
return "iso-8859-11"
145+
case charamel.ISO885913:
146+
return "iso-8859-13"
147+
case charamel.ISO885914:
148+
return "iso-8859-14"
149+
case charamel.ISO885915:
150+
return "iso-8859-15"
151+
case charamel.ISO885916:
152+
return "iso-8859-16"
153+
154+
// Windows编码
155+
case charamel.CP1250:
156+
return "cp1250"
157+
case charamel.CP1251:
158+
return "cp1251"
159+
case charamel.CP1252:
160+
return "cp1252"
161+
case charamel.CP1253:
162+
return "cp1253"
163+
case charamel.CP1254:
164+
return "cp1254"
165+
case charamel.CP1255:
166+
return "cp1255"
167+
case charamel.CP1256:
168+
return "cp1256"
169+
case charamel.CP1257:
170+
return "cp1257"
171+
case charamel.CP1258:
172+
return "cp1258"
173+
174+
// 中文编码
175+
case charamel.GB2312:
176+
return "gb2312"
177+
case charamel.GBK:
178+
return "gbk"
179+
case charamel.GB18030:
180+
return "gb18030"
181+
case charamel.BIG5:
182+
return "big5"
183+
case charamel.BIG5HKSCS:
184+
return "big5-hkscs"
185+
case charamel.HZ:
186+
return "hz-gb-2312"
187+
188+
// 日文编码
189+
case charamel.EUCJP:
190+
return "euc-jp"
191+
case charamel.SHIFTJIS:
192+
return "shift-jis"
193+
case charamel.ISO2022JP:
194+
return "iso-2022-jp"
195+
196+
// 韩文编码
197+
case charamel.EUCKR:
198+
return "euc-kr"
199+
case charamel.CP949:
200+
return "windows-949"
201+
case charamel.ISO2022KR:
202+
return "iso-2022-kr"
203+
204+
// 俄文编码
205+
case charamel.KOI8R:
206+
return "koi8-r"
207+
case charamel.KOI8U:
208+
return "koi8-u"
209+
case charamel.CP866:
210+
return "cp866"
211+
212+
// 泰文编码
213+
case charamel.TIS620:
214+
return "tis-620"
215+
case charamel.CP874:
216+
return "windows-874"
217+
218+
// Mac编码
219+
case charamel.MACROMAN:
220+
return "macintosh"
221+
case charamel.MACCYRILLIC:
222+
return "x-mac-cyrillic"
223+
224+
// 默认返回原始字符串
225+
default:
226+
return encoding.String()
227+
}
228+
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ require (
88
github.com/alecthomas/kong v0.8.1
99
github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab
1010
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f
11+
github.com/gonejack/charamel v1.0.1
1112
github.com/wlynxg/chardet v1.0.0
1213
golang.org/x/text v0.14.0
1314
)
1415

1516
require (
17+
github.com/x448/float16 v0.8.4 // indirect
1618
go.uber.org/multierr v1.11.0 // indirect
1719
go.uber.org/zap v1.27.0 // indirect
1820
)

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab h1:8sh8Pynho3gYrdzdb
1010
github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab/go.mod h1:p9sYlSrwy19GJyed1EXDwdZeL4rVBd1tPoPgDvs7U1Q=
1111
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
1212
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
13+
github.com/gonejack/charamel v1.0.1 h1:TKBCkAl0PoI+0s6jJJMRoLl2rLhuHL+BPcXvCCsa6qo=
14+
github.com/gonejack/charamel v1.0.1/go.mod h1:RQJBTqDLll8x8xAJvJAFhQSoOphw8NKZ+paIcDS0aLk=
1315
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
1416
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
1517
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -18,6 +20,8 @@ github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKs
1820
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
1921
github.com/wlynxg/chardet v1.0.0 h1:2gEgdmy/at4xIC+mOfNf1OFsb4LtnG9IcumfRXii/d0=
2022
github.com/wlynxg/chardet v1.0.0/go.mod h1:DgEUcneT6QieJ9qEhtRFOHWOjSNLPAo8lwUhjNopcFE=
23+
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
24+
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
2125
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
2226
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
2327
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=

0 commit comments

Comments
 (0)