// helpers
//

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list ap;
@@ -335,6 +321,21 @@ struct llm_tokenizer_spm {
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+    using std::priority_queue<T, Container, Compare>::priority_queue;
+
+    T pop_move() {
+        T item = std::move(this->c.front());
+        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+        this->c.pop_back();
+        return item;
+    }
+
+    void pop() = delete;
+};
+
struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
@@ -343,7 +344,7 @@ struct llm_bigram_bpe {
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
-    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
@@ -402,6 +403,7 @@ struct llm_tokenizer_bpe {
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -424,6 +426,8 @@ struct llm_tokenizer_bpe {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
@@ -531,8 +535,7 @@ struct llm_tokenizer_bpe {
        // build token(s)
        while (!work_queue.empty()) {
-            auto bigram = work_queue.top();
-            work_queue.pop();
+            auto bigram = work_queue.pop_move();

            auto & left_symbol = symbols[bigram.left];
            auto & right_symbol = symbols[bigram.right];
@@ -1480,11 +1483,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
    return vocab.special_pad_id;
}

-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
    return vocab.tokenizer_add_bos;
}

-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
    return vocab.tokenizer_add_eos;
}
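Note on the queue change: std::priority_queue::top() returns a const reference, so the old "auto bigram = work_queue.top(); work_queue.pop();" pair copied every bigram before discarding the original. The llama_priority_queue subclass reaches the protected members c (the underlying container) and comp (the comparator) of std::priority_queue to move the top element out instead, and deletes pop() so callers cannot fall back to the copying path. Below is a minimal standalone sketch of the same pattern; the class name movable_priority_queue and the demo values are illustrative, not part of the diff.

#include <algorithm>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

template <typename T, typename Container = std::vector<T>,
          typename Compare = std::less<typename Container::value_type>>
class movable_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    // Move the top element out instead of copying it.
    // Requires that comparing a moved-from T is valid (true for
    // std::string and for aggregate types like llm_bigram_bpe).
    T pop_move() {
        T item = std::move(this->c.front());                        // move the max element out
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);  // shift the moved-from slot to the back
        this->c.pop_back();                                         // drop it
        return item;
    }

    void pop() = delete; // force callers through pop_move()
};

int main() {
    movable_priority_queue<std::string> q;
    q.push("banana");
    q.push("cherry");
    q.push("apple");
    while (!q.empty()) {
        std::cout << q.pop_move() << '\n';  // prints: cherry, banana, apple
    }
}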