import inspect
import os

import numpy as np

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio
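

# Note: jfk_path, physcisworks_path, and data_dir used below are pytest fixtures;
# they are assumed to be provided elsewhere in the test suite (e.g. a conftest.py)
# and are not defined in this file.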
def test_supported_languages():
    model = WhisperModel("tiny.en")
    assert model.supported_languages == ["en"]
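

# Sequential transcription of the JFK clip: checks language detection metadata,
# the full transcript, and word-level timestamps, then runs the same clip through
# the batched pipeline.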
def test_transcribe(jfk_path):
    model = WhisperModel("tiny")
    segments, info = model.transcribe(jfk_path, word_timestamps=True)
    assert info.all_language_probs is not None

    assert info.language == "en"
    assert info.language_probability > 0.9
    assert info.duration == 11

    # Get the top language from the full probability list; it should match the
    # metadata already reported in info
    top_lang, top_lang_score = info.all_language_probs[0]
    assert info.language == top_lang
    assert abs(info.language_probability - top_lang_score) < 1e-16

    segments = list(segments)

    assert len(segments) == 1

    segment = segments[0]

    assert segment.text == (
        " And so my fellow Americans, ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    assert segment.text == "".join(word.word for word in segment.words)
    assert segment.start == segment.words[0].start
    assert segment.end == segment.words[-1].end

    batched_model = BatchedInferencePipeline(model=model)
    result, info = batched_model.transcribe(
        jfk_path, word_timestamps=True, vad_filter=False
    )
    assert info.language == "en"
    assert info.language_probability > 0.7
    segments = []
    for segment in result:
        segments.append(
            {"start": segment.start, "end": segment.end, "text": segment.text}
        )
    assert len(segments) == 1
    # `segment` still refers to the last Segment yielded by the loop above
    assert segment.text == (
        " And so my fellow Americans ask not what your country can do for you, "
        "ask what you can do for your country."
    )
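

# Batched transcription of a longer recording, with and without word timestamps.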
def test_batched_transcribe(physcisworks_path):
    model = WhisperModel("tiny")
    batched_model = BatchedInferencePipeline(model=model)
    result, info = batched_model.transcribe(physcisworks_path, batch_size=16)
    assert info.language == "en"
    assert info.language_probability > 0.7
    segments = []
    for segment in result:
        segments.append(
            {"start": segment.start, "end": segment.end, "text": segment.text}
        )
    # number of segments of roughly 30 seconds each
    assert len(segments) == 7

    result, info = batched_model.transcribe(
        physcisworks_path,
        batch_size=16,
        without_timestamps=False,
        word_timestamps=True,
    )
    segments = []
    for segment in result:
        assert segment.words is not None
        segments.append(
            {"start": segment.start, "end": segment.end, "text": segment.text}
        )
    assert len(segments) > 7
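

# Empty input should yield no segments and must not crash language detection.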
def test_empty_audio():
    audio = np.asarray([], dtype="float32")
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model=model)
    assert list(model.transcribe(audio)[0]) == []
    assert list(pipeline.transcribe(audio)[0]) == []
    model.detect_language(audio)
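

# A decoding prefix should still produce the full transcript with plausible timestamps.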
def test_prefix_with_timestamps(jfk_path):
    model = WhisperModel("tiny")
    segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans")
    segments = list(segments)

    assert len(segments) == 1

    segment = segments[0]

    assert segment.text == (
        " And so my fellow Americans, ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    assert segment.start == 0
    assert 10 < segment.end <= 11
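

# VAD filtering with custom parameters; the options should be echoed back in info.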
def test_vad(jfk_path):
    model = WhisperModel("tiny")
    segments, info = model.transcribe(
        jfk_path,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
    )
    segments = list(segments)

    assert len(segments) == 1
    segment = segments[0]

    assert segment.text == (
        " And so my fellow Americans ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    assert 0 < segment.start < 1
    assert 10 < segment.end < 11

    assert info.vad_options.min_silence_duration_ms == 500
    assert info.vad_options.speech_pad_ms == 200
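

# Stereo file split into left/right channels, each transcribed independently.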
def test_stereo_diarization(data_dir):
    model = WhisperModel("tiny")

    audio_path = os.path.join(data_dir, "stereo_diarization.wav")
    left, right = decode_audio(audio_path, split_stereo=True)

    segments, _ = model.transcribe(left)
    transcription = "".join(segment.text for segment in segments).strip()
    assert transcription == (
        "He began a confused complaint against the wizard, "
        "who had vanished behind the curtain on the left."
    )

    segments, _ = model.transcribe(right)
    transcription = "".join(segment.text for segment in segments).strip()
    assert transcription == "The horizon seems extremely distant."
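

# Audio that switches language (English then German), transcribed with
# multilingual=True, sequentially and through the batched pipeline.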
def test_multilingual_transcription(data_dir):
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model)
    audio_path = os.path.join(data_dir, "multilingual.mp3")
    audio = decode_audio(audio_path)

    segments, info = model.transcribe(
        audio,
        multilingual=True,
        without_timestamps=True,
        condition_on_previous_text=False,
    )
    segments = list(segments)

    # The expected strings reproduce the tiny model's raw output verbatim,
    # including its transcription mistakes; do not "correct" them.
    assert (
        segments[0].text
        == " Permission is hereby granted, free of charge, to any person obtaining a copy of the"
        " software and associated documentation files to deal in the software without restriction,"
        " including without limitation the rights to use, copy, modify, merge, publish, distribute"
        ", sublicence, and or cell copies of the software, and to permit persons to whom the "
        "software is furnished to do so, subject to the following conditions. The above copyright"
        " notice and this permission notice, shall be included in all copies or substantial "
        "portions of the software."
    )

    assert (
        segments[1].text
        == " Jedem, der dieses Software und die dazu gehöregen Dokumentationsdatein erhält, wird "
        "hiermit unengeltlich die Genehmigung erteilt, wird der Software und eingeschränkt zu "
        "verfahren. Dies umfasst insbesondere das Recht, die Software zu verwenden, zu "
        "vervielfältigen, zu modifizieren, zu Samenzofügen, zu veröffentlichen, zu verteilen, "
        "unterzulizenzieren und oder kopieren der Software zu verkaufen und diese Rechte "
        "unterfolgen den Bedingungen anderen zu übertragen."
    )

    segments, info = pipeline.transcribe(audio, multilingual=True)
    segments = list(segments)

    assert (
        segments[0].text
        == " Permission is hereby granted, free of charge, to any person obtaining a copy of the"
        " software and associated documentation files to deal in the software without restriction,"
        " including without limitation the rights to use, copy, modify, merge, publish, distribute"
        ", sublicence, and or cell copies of the software, and to permit persons to whom the "
        "software is furnished to do so, subject to the following conditions. The above copyright"
        " notice and this permission notice, shall be included in all copies or substantial "
        "portions of the software."
    )

    assert (
        "Dokumentationsdatein erhält, wird hiermit unengeltlich die Genehmigung erteilt,"
        " wird der Software und eingeschränkt zu verfahren. Dies umfasst insbesondere das Recht,"
        " die Software zu verwenden, zu vervielfältigen, zu modifizieren"
        in segments[1].text
    )
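

# Hotwords ("ComfyUI") should bias decoding and be recorded in the returned options.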
def test_hotwords(data_dir):
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model)
    audio_path = os.path.join(data_dir, "hotwords.mp3")
    audio = decode_audio(audio_path)

    segments, info = model.transcribe(audio, hotwords="ComfyUI")
    segments = list(segments)

    assert "ComfyUI" in segments[0].text
    assert info.transcription_options.hotwords == "ComfyUI"

    segments, info = pipeline.transcribe(audio, hotwords="ComfyUI")
    segments = list(segments)

    assert "ComfyUI" in segments[0].text
    assert info.transcription_options.hotwords == "ComfyUI"
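

# The batched pipeline should accept the same transcribe() arguments as WhisperModel,
# apart from batch_size.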
def test_transcribe_signature():
    model_transcribe_args = set(inspect.getargs(WhisperModel.transcribe.__code__).args)
    pipeline_transcribe_args = set(
        inspect.getargs(BatchedInferencePipeline.transcribe.__code__).args
    )
    pipeline_transcribe_args.remove("batch_size")

    assert model_transcribe_args == pipeline_transcribe_args
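

# Segment and word timestamps must be non-decreasing and bounded by the audio duration.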
def test_monotonic_timestamps(physcisworks_path):
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model=model)

    segments, info = model.transcribe(physcisworks_path, word_timestamps=True)
    segments = list(segments)

    for i in range(len(segments) - 1):
        assert segments[i].start <= segments[i].end
        assert segments[i].end <= segments[i + 1].start
        for word in segments[i].words:
            assert word.start <= word.end
            assert word.end <= segments[i].end
    assert segments[-1].end <= info.duration

    segments, info = pipeline.transcribe(physcisworks_path, word_timestamps=True)
    segments = list(segments)

    for i in range(len(segments) - 1):
        assert segments[i].start <= segments[i].end
        assert segments[i].end <= segments[i + 1].start
        for word in segments[i].words:
            assert word.start <= word.end
            assert word.end <= segments[i].end
    assert segments[-1].end <= info.duration