from faster_whisper import WhisperModel
from faster_whisper.tokenizer import Tokenizer
from faster_whisper.transcribe import get_suppressed_tokens
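

# These tests exercise faster-whisper's tokenizer helpers. Instantiating
# WhisperModel("tiny.en") / WhisperModel("tiny") fetches the converted
# CTranslate2 models from the Hugging Face Hub on first use, so running
# the tests needs network access or a pre-populated local cache.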
def test_suppressed_tokens_minus_1():
    model = WhisperModel("tiny.en")

    tokenizer = Tokenizer(model.hf_tokenizer, False)
    tokens = get_suppressed_tokens(tokenizer, [-1])
    assert tokens == (
        1,
        2,
        7,
        8,
        9,
        10,
        14,
        25,
        26,
        27,
        28,
        29,
        31,
        58,
        59,
        60,
        61,
        62,
        63,
        90,
        91,
        92,
        93,
        357,
        366,
        438,
        532,
        685,
        705,
        796,
        930,
        1058,
        1220,
        1267,
        1279,
        1303,
        1343,
        1377,
        1391,
        1635,
        1782,
        1875,
        2162,
        2361,
        2488,
        3467,
        4008,
        4211,
        4600,
        4808,
        5299,
        5855,
        6329,
        7203,
        9609,
        9959,
        10563,
        10786,
        11420,
        11709,
        11907,
        13163,
        13697,
        13700,
        14808,
        15306,
        16410,
        16791,
        17992,
        19203,
        19510,
        20724,
        22305,
        22935,
        27007,
        30109,
        30420,
        33409,
        34949,
        40283,
        40493,
        40549,
        47282,
        49146,
        50257,
        50357,
        50358,
        50359,
        50360,
    )
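

# Suppressing one explicit id should yield that id plus the marker tokens
# get_suppressed_tokens always adds. By their positions in the tiny.en
# vocabulary, 50257 and 50357-50360 would be <|startoftranscript|> and the
# task/prompt specials, and 13 the "." token; that mapping is an inference
# from the GPT-2-style vocabulary layout, not something this test asserts.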
def test_suppressed_tokens_minus_value():
    model = WhisperModel("tiny.en")

    tokenizer = Tokenizer(model.hf_tokenizer, False)
    tokens = get_suppressed_tokens(tokenizer, [13])
    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)
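

# split_tokens_on_unicode groups tokens into word strings at valid-Unicode
# boundaries. Because the multilingual "tiny" vocabulary is byte-level BPE,
# a token whose bytes do not complete a UTF-8 codepoint decodes to the
# replacement character U+FFFD, which this input triggers deliberately.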
def test_split_on_unicode():
    model = WhisperModel("tiny")
    tokenizer = Tokenizer(model.hf_tokenizer, False)

    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
    words, word_tokens = tokenizer.split_tokens_on_unicode(tokens)

    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]