I'm comparing the output of this library with the output of the Python tokenizer, and I see that the token ids differ by 1.
I can decode the Python tokens in Rust when I subtract 1 from each token id:
fn main() {
use sentencepiece::SentencePieceProcessor;
let spp_model_path = "path_to/sentencepiece.bpe.model"; #from the same hf repo as the model above:
let spp = SentencePieceProcessor::open(spp_model_path).unwrap();
let article = "English (`eng_Latn`) is set as the default language from which to translate. In order to specify that you'd like to translate from a different language, you should specify the BCP-47 code in the `src_lang` keyword argument of the tokenizer initialization.";
let pieces = spp
.encode(article)
.unwrap()
.into_iter()
.map(|p| p.id)
.collect::<Vec<_>>();
println!("{:?}", &pieces);
let result = spp.decode_piece_ids(&pieces);
println!("{:?}", result);
let pieces_rust = vec![
30310, 103, 253989, 179, 248119, 68423, 248062, 253989, 248160, 247, 2635, 387, 348,
179662, 65444, 5056, 9088, 201, 3291, 28063, 248074, 716, 22755, 201, 10410, 8161, 1481,
1258, 248115, 248071, 6398, 201, 3291, 28063, 5056, 8, 30157, 65444, 248078, 1258, 12515,
10410, 8161, 348, 113, 29132, 7553, 248282, 44777, 107, 348, 248058, 253989, 84411, 248119,
7496, 253989, 22659, 50548, 37491, 451, 348, 1775, 429, 2500, 107, 21533, 117079, 248074,
];
let pieces_python = vec![
256047, 30311, 104, 253990, 256047, 248059, 253990, 2481, 61, 248, 2636, 388, 349, 179663,
65445, 5057, 9089, 202, 3292, 28064, 248075, 717, 22756, 202, 10411, 8162, 1482, 1259,
248116, 248072, 6399, 202, 3292, 28064, 5057, 9, 30158, 65445, 248079, 1259, 12516, 10411,
8162, 349, 114, 29133, 7554, 248283, 44778, 108, 349, 248059, 253990, 84412, 248120, 7497,
253990, 22660, 50549, 37492, 452, 349, 1776, 430, 2501, 108, 21534, 117080, 248075, 2,
]
.iter()
.filter(|p| p.to_owned().to_owned() < 256040)
.map(|p| p.to_owned() - 1)
.collect::<Vec<u32>>();
let result_rust = spp.decode_piece_ids(&pieces_rust);
println!("{:?}", result_rust);
let result_py = spp.decode_piece_ids(&pieces_python);
println!("{:?}", result_py);
}