|
| 1 | +//go:build charamel |
| 2 | + |
| 3 | +package chardet |
| 4 | + |
| 5 | +import ( |
| 6 | + "errors" |
| 7 | + |
| 8 | + "github.com/gonejack/charamel" |
| 9 | +) |
| 10 | + |
| 11 | +func init() { |
| 12 | + prefer(DetectEncodingByCharamel) |
| 13 | +} |
| 14 | + |
| 15 | +var encodings = []charamel.Encoding{ |
| 16 | + // UTF编码 |
| 17 | + charamel.UTF8, |
| 18 | + charamel.UTF16, |
| 19 | + charamel.UTF16BE, |
| 20 | + charamel.UTF16LE, |
| 21 | + charamel.UTF32, |
| 22 | + charamel.UTF32BE, |
| 23 | + charamel.UTF32LE, |
| 24 | + |
| 25 | + // ASCII |
| 26 | + charamel.ASCII, |
| 27 | + |
| 28 | + // 西欧编码 (ISO-8859) |
| 29 | + charamel.LATIN1, // ISO-8859-1 |
| 30 | + charamel.ISO88592, // ISO-8859-2 |
| 31 | + charamel.ISO88593, // ISO-8859-3 |
| 32 | + charamel.ISO88594, // ISO-8859-4 |
| 33 | + charamel.ISO88595, // ISO-8859-5 |
| 34 | + charamel.ISO88596, // ISO-8859-6 |
| 35 | + charamel.ISO88597, // ISO-8859-7 |
| 36 | + charamel.ISO88598, // ISO-8859-8 |
| 37 | + charamel.ISO88599, // ISO-8859-9 |
| 38 | + charamel.ISO885910, // ISO-8859-10 |
| 39 | + charamel.ISO885911, // ISO-8859-11 |
| 40 | + charamel.ISO885913, // ISO-8859-13 |
| 41 | + charamel.ISO885914, // ISO-8859-14 |
| 42 | + charamel.ISO885915, // ISO-8859-15 |
| 43 | + charamel.ISO885916, // ISO-8859-16 |
| 44 | + |
| 45 | + // Windows编码 (CP1250-1258) |
| 46 | + charamel.CP1250, |
| 47 | + charamel.CP1251, |
| 48 | + charamel.CP1252, |
| 49 | + charamel.CP1253, |
| 50 | + charamel.CP1254, |
| 51 | + charamel.CP1255, |
| 52 | + charamel.CP1256, |
| 53 | + charamel.CP1257, |
| 54 | + charamel.CP1258, |
| 55 | + |
| 56 | + // 中文编码 |
| 57 | + charamel.GB2312, |
| 58 | + charamel.GBK, |
| 59 | + charamel.GB18030, |
| 60 | + charamel.BIG5, |
| 61 | + charamel.BIG5HKSCS, |
| 62 | + charamel.HZ, // hz-gb-2312 |
| 63 | + |
| 64 | + // 日文编码 |
| 65 | + charamel.EUCJP, // euc-jp |
| 66 | + charamel.SHIFTJIS, // shift_jis |
| 67 | + charamel.ISO2022JP, // iso-2022-jp |
| 68 | + |
| 69 | + // 韩文编码 |
| 70 | + charamel.EUCKR, // euc-kr |
| 71 | + charamel.CP949, // windows-949 |
| 72 | + charamel.ISO2022KR, // iso-2022-kr |
| 73 | + |
| 74 | + // 俄文编码 |
| 75 | + charamel.KOI8R, // koi8-r |
| 76 | + charamel.KOI8U, // koi8-u |
| 77 | + charamel.CP866, // cp866 |
| 78 | + |
| 79 | + // 泰文编码 |
| 80 | + charamel.TIS620, // tis-620 |
| 81 | + charamel.CP874, // windows-874 |
| 82 | + |
| 83 | + // Mac编码 |
| 84 | + charamel.MACROMAN, // macintosh |
| 85 | + charamel.MACCYRILLIC, // x-mac-cyrillic |
| 86 | +} |
| 87 | + |
| 88 | +func DetectEncodingByCharamel(dat []byte) (string, error) { |
| 89 | + d, err := charamel.NewDetector(encodings, 0) |
| 90 | + if err != nil { |
| 91 | + return "", err |
| 92 | + } |
| 93 | + v := d.Detect(dat) |
| 94 | + if v == nil { |
| 95 | + return "", errors.New("detect failed by github.com/gonejack/charamel") |
| 96 | + } |
| 97 | + return mapName(v), nil |
| 98 | +} |
| 99 | + |
| 100 | +func mapName(encoding *charamel.Encoding) string { |
| 101 | + switch *encoding { |
| 102 | + // UTF编码 |
| 103 | + case charamel.UTF8: |
| 104 | + return "utf-8" |
| 105 | + case charamel.UTF16: |
| 106 | + return "utf-16" |
| 107 | + case charamel.UTF16BE: |
| 108 | + return "utf-16be" |
| 109 | + case charamel.UTF16LE: |
| 110 | + return "utf-16le" |
| 111 | + case charamel.UTF32: |
| 112 | + return "utf-32" |
| 113 | + case charamel.UTF32BE: |
| 114 | + return "utf-32be" |
| 115 | + case charamel.UTF32LE: |
| 116 | + return "utf-32le" |
| 117 | + |
| 118 | + // ASCII |
| 119 | + case charamel.ASCII: |
| 120 | + return "ascii" |
| 121 | + |
| 122 | + // 西欧编码 (ISO-8859) |
| 123 | + case charamel.LATIN1: |
| 124 | + return "iso-8859-1" |
| 125 | + case charamel.ISO88592: |
| 126 | + return "iso-8859-2" |
| 127 | + case charamel.ISO88593: |
| 128 | + return "iso-8859-3" |
| 129 | + case charamel.ISO88594: |
| 130 | + return "iso-8859-4" |
| 131 | + case charamel.ISO88595: |
| 132 | + return "iso-8859-5" |
| 133 | + case charamel.ISO88596: |
| 134 | + return "iso-8859-6" |
| 135 | + case charamel.ISO88597: |
| 136 | + return "iso-8859-7" |
| 137 | + case charamel.ISO88598: |
| 138 | + return "iso-8859-8" |
| 139 | + case charamel.ISO88599: |
| 140 | + return "iso-8859-9" |
| 141 | + case charamel.ISO885910: |
| 142 | + return "iso-8859-10" |
| 143 | + case charamel.ISO885911: |
| 144 | + return "iso-8859-11" |
| 145 | + case charamel.ISO885913: |
| 146 | + return "iso-8859-13" |
| 147 | + case charamel.ISO885914: |
| 148 | + return "iso-8859-14" |
| 149 | + case charamel.ISO885915: |
| 150 | + return "iso-8859-15" |
| 151 | + case charamel.ISO885916: |
| 152 | + return "iso-8859-16" |
| 153 | + |
| 154 | + // Windows编码 |
| 155 | + case charamel.CP1250: |
| 156 | + return "cp1250" |
| 157 | + case charamel.CP1251: |
| 158 | + return "cp1251" |
| 159 | + case charamel.CP1252: |
| 160 | + return "cp1252" |
| 161 | + case charamel.CP1253: |
| 162 | + return "cp1253" |
| 163 | + case charamel.CP1254: |
| 164 | + return "cp1254" |
| 165 | + case charamel.CP1255: |
| 166 | + return "cp1255" |
| 167 | + case charamel.CP1256: |
| 168 | + return "cp1256" |
| 169 | + case charamel.CP1257: |
| 170 | + return "cp1257" |
| 171 | + case charamel.CP1258: |
| 172 | + return "cp1258" |
| 173 | + |
| 174 | + // 中文编码 |
| 175 | + case charamel.GB2312: |
| 176 | + return "gb2312" |
| 177 | + case charamel.GBK: |
| 178 | + return "gbk" |
| 179 | + case charamel.GB18030: |
| 180 | + return "gb18030" |
| 181 | + case charamel.BIG5: |
| 182 | + return "big5" |
| 183 | + case charamel.BIG5HKSCS: |
| 184 | + return "big5-hkscs" |
| 185 | + case charamel.HZ: |
| 186 | + return "hz-gb-2312" |
| 187 | + |
| 188 | + // 日文编码 |
| 189 | + case charamel.EUCJP: |
| 190 | + return "euc-jp" |
| 191 | + case charamel.SHIFTJIS: |
| 192 | + return "shift-jis" |
| 193 | + case charamel.ISO2022JP: |
| 194 | + return "iso-2022-jp" |
| 195 | + |
| 196 | + // 韩文编码 |
| 197 | + case charamel.EUCKR: |
| 198 | + return "euc-kr" |
| 199 | + case charamel.CP949: |
| 200 | + return "windows-949" |
| 201 | + case charamel.ISO2022KR: |
| 202 | + return "iso-2022-kr" |
| 203 | + |
| 204 | + // 俄文编码 |
| 205 | + case charamel.KOI8R: |
| 206 | + return "koi8-r" |
| 207 | + case charamel.KOI8U: |
| 208 | + return "koi8-u" |
| 209 | + case charamel.CP866: |
| 210 | + return "cp866" |
| 211 | + |
| 212 | + // 泰文编码 |
| 213 | + case charamel.TIS620: |
| 214 | + return "tis-620" |
| 215 | + case charamel.CP874: |
| 216 | + return "windows-874" |
| 217 | + |
| 218 | + // Mac编码 |
| 219 | + case charamel.MACROMAN: |
| 220 | + return "macintosh" |
| 221 | + case charamel.MACCYRILLIC: |
| 222 | + return "x-mac-cyrillic" |
| 223 | + |
| 224 | + // 默认返回原始字符串 |
| 225 | + default: |
| 226 | + return encoding.String() |
| 227 | + } |
| 228 | +} |
0 commit comments