test_tokenizer.py

from faster_whisper import WhisperModel
from faster_whisper.tokenizer import Tokenizer
from faster_whisper.transcribe import get_suppressed_tokens


def test_suppressed_tokens_minus_1():
    # A -1 entry expands to the default suppression list: the non-speech
    # tokens plus the tokenizer's special tokens.
    model = WhisperModel("tiny.en")
    tokenizer = Tokenizer(model.hf_tokenizer, False)
    tokens = get_suppressed_tokens(tokenizer, [-1])
    assert tokens == (
        1,
        2,
        7,
        8,
        9,
        10,
        14,
        25,
        26,
        27,
        28,
        29,
        31,
        58,
        59,
        60,
        61,
        62,
        63,
        90,
        91,
        92,
        93,
        357,
        366,
        438,
        532,
        685,
        705,
        796,
        930,
        1058,
        1220,
        1267,
        1279,
        1303,
        1343,
        1377,
        1391,
        1635,
        1782,
        1875,
        2162,
        2361,
        2488,
        3467,
        4008,
        4211,
        4600,
        4808,
        5299,
        5855,
        6329,
        7203,
        9609,
        9959,
        10563,
        10786,
        11420,
        11709,
        11907,
        13163,
        13697,
        13700,
        14808,
        15306,
        16410,
        16791,
        17992,
        19203,
        19510,
        20724,
        22305,
        22935,
        27007,
        30109,
        30420,
        33409,
        34949,
        40283,
        40493,
        40549,
        47282,
        49146,
        50257,
        50357,
        50358,
        50359,
        50360,
    )


def test_suppressed_tokens_minus_value():
    # An explicitly listed token id is suppressed together with the
    # tokenizer's special tokens.
    model = WhisperModel("tiny.en")
    tokenizer = Tokenizer(model.hf_tokenizer, False)
    tokens = get_suppressed_tokens(tokenizer, [13])
    assert tokens == (13, 50257, 50357, 50358, 50359, 50360)


def test_split_on_unicode():
    # A token that decodes to an incomplete UTF-8 sequence is kept as the
    # replacement character "\ufffd" instead of being merged into a
    # neighboring word.
    model = WhisperModel("tiny")
    tokenizer = Tokenizer(model.hf_tokenizer, False)
    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
    words, word_tokens = tokenizer.split_tokens_on_unicode(tokens)

    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
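

# Illustrative sketch, not part of the test suite above: the same helpers can
# be exercised interactively. The model choice ("tiny") and the printed fields
# are assumptions; the calls mirror the ones used in the tests and require the
# model to be downloadable.
if __name__ == "__main__":
    model = WhisperModel("tiny")
    tokenizer = Tokenizer(model.hf_tokenizer, False)

    # Expand the default (-1) suppression list and inspect a few entries.
    suppressed = get_suppressed_tokens(tokenizer, [-1])
    print(len(suppressed), suppressed[:10])

    # Split a token sequence into words aligned with their token ids.
    words, word_tokens = tokenizer.split_tokens_on_unicode([8404, 871, 287])
    print(list(zip(words, word_tokens)))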