from __future__ import absolute_import import logging import time from datetime import timedelta from requests import Session from requests.exceptions import JSONDecodeError from subliminal_patch.subtitle import Subtitle from subliminal_patch.providers import Provider from subliminal import __short_version__ from subliminal.exceptions import ConfigurationError from subzero.language import Language from subliminal.video import Episode, Movie from babelfish.exceptions import LanguageReverseError import ffmpeg import functools from pycountry import languages # These are all the languages Whisper supports. # from whisper.tokenizer import LANGUAGES whisper_languages = { "en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian", "ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish", "pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish", "it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese", "he": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech", "ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian", "th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian", "la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak", "te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian", "az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", "mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", "ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", "sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", "km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", "oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", "gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", "fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", "mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", "tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", "ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese", } logger = logging.getLogger(__name__) def set_log_level(newLevel="INFO"): newLevel = newLevel.upper() # print(f'WhisperAI log level changing from {logging._levelToName[logger.getEffectiveLevel()]} to {newLevel}') logger.setLevel(getattr(logging, newLevel)) # initialize to default above set_log_level() # ffmpeg uses the older ISO 639-2 code when extracting audio streams based on language # if we give it the newer ISO 639-3 code it can't find that audio stream by name because it's different # for example it wants 'ger' instead of 'deu' for the German language # or 'fre' instead of 'fra' for the French language def get_ISO_639_2_code(iso639_3_code): # find the language using ISO 639-3 code language = languages.get(alpha_3=iso639_3_code) # get the ISO 639-2 code or use the original input if there isn't a match iso639_2_code = language.bibliographic if language and hasattr(language, 'bibliographic') else iso639_3_code logger.debug(f"ffmpeg using language code '{iso639_2_code}' (instead of '{iso639_3_code}')") return iso639_2_code @functools.lru_cache(2) def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None): logger.debug("Encoding audio stream to WAV with ffmpeg") try: # This launches a subprocess to decode audio while down-mixing and resampling as necessary. inp = ffmpeg.input(path, threads=0) if audio_stream_language: # There is more than one audio stream, so pick the requested one by name # Use the ISO 639-2 code if available audio_stream_language = get_ISO_639_2_code(audio_stream_language) logger.debug(f"Whisper will use the '{audio_stream_language}' audio stream for {path}") inp = inp[f'a:m:language:{audio_stream_language}'] out, _ = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af="aresample=async=1") \ .run(cmd=[ffmpeg_path, "-nostdin"], capture_stdout=True, capture_stderr=True) except ffmpeg.Error as e: logger.warning(f"ffmpeg failed to load audio: {e.stderr.decode()}") return None logger.debug(f"Finished encoding audio stream in {path} with no errors") return out def whisper_get_language(code, name): # Whisper uses an inconsistent mix of alpha2 and alpha3 language codes try: return Language.fromalpha2(code) except LanguageReverseError: return Language.fromname(name) def whisper_get_language_reverse(alpha3): # Returns the whisper language code given an alpha3b language for wl in whisper_languages: lan = whisper_get_language(wl, whisper_languages[wl]) if lan.alpha3 == alpha3: return wl return None def language_from_alpha3(lang): name = Language(lang).name return name class WhisperAISubtitle(Subtitle): '''Whisper AI Subtitle.''' provider_name = 'whisperai' hash_verifiable = False def __init__(self, language, video): super(WhisperAISubtitle, self).__init__(language) self.video = video self.task = None self.audio_language = None self.force_audio_stream = None @property def id(self): # Construct unique id otherwise provider pool will think # subtitles are all the same and drop all except the first one # This is important for language profiles with more than one language return f"{self.video.original_name}_{self.task}_{str(self.language)}" def get_matches(self, video): matches = set() if isinstance(video, Episode): matches.update(["series", "season", "episode"]) elif isinstance(video, Movie): matches.update(["title"]) return matches class WhisperAIProvider(Provider): '''Whisper AI Provider.''' languages = set() for lan in whisper_languages: languages.update({whisper_get_language(lan, whisper_languages[lan])}) video_types = (Episode, Movie) def __init__(self, endpoint=None, response=None, timeout=None, ffmpeg_path=None, loglevel=None): set_log_level(loglevel) if not endpoint: raise ConfigurationError('Whisper Web Service Endpoint must be provided') if not response: raise ConfigurationError('Whisper Web Service Connection/response timeout must be provided') if not timeout: raise ConfigurationError('Whisper Web Service Transcription/translation timeout must be provided') if not ffmpeg_path: raise ConfigurationError("ffmpeg path must be provided") self.endpoint = endpoint.rstrip("/") self.response = int(response) self.timeout = int(timeout) self.session = None self.ffmpeg_path = ffmpeg_path def initialize(self): self.session = Session() self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ def terminate(self): self.session.close() @functools.lru_cache(2048) def detect_language(self, path) -> Language: out = encode_audio_stream(path, self.ffmpeg_path) if out == None: logger.info(f"Whisper cannot detect language of {path} because of missing/bad audio track") return None r = self.session.post(f"{self.endpoint}/detect-language", params={'encode': 'false'}, files={'audio_file': out}, timeout=(self.response, self.timeout)) try: results = r.json() except JSONDecodeError: results = {} if len(results) == 0: logger.info(f"Whisper returned empty response when detecting language") return None logger.debug(f"Whisper detected language of {path} as {results['detected_language']}") return whisper_get_language(results["language_code"], results["detected_language"]) def query(self, language, video): if language not in self.languages: return None sub = WhisperAISubtitle(language, video) sub.task = "transcribe" if video.audio_languages and not (list(video.audio_languages)[0] == "und" and len(video.audio_languages) == 1): if language.alpha3 in video.audio_languages: sub.audio_language = language.alpha3 if len(list(video.audio_languages)) > 1: sub.force_audio_stream = language.alpha3 else: sub.task = "translate" eligible_languages = list(video.audio_languages) if len(eligible_languages) > 1: if "und" in eligible_languages: eligible_languages.remove("und") sub.audio_language = eligible_languages[0] else: # We must detect the language manually detected_lang = self.detect_language(video.original_path) if detected_lang == None: sub.task = "error" # tell the user what is wrong sub.release_info = "bad/missing audio track - cannot transcribe" return sub if detected_lang != language: sub.task = "translate" sub.audio_language = detected_lang.alpha3 if sub.task == "translate": if language.alpha3 != "eng": logger.debug(f"Translation only possible from {language} to English") return None # tell the user what we are about to do sub.release_info = f"{sub.task} {language_from_alpha3(sub.audio_language)} audio -> {language_from_alpha3(language.alpha3)} SRT" logger.debug(f"Whisper query: ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]") return sub def list_subtitles(self, video, languages): subtitles = [self.query(l, video) for l in languages] return [s for s in subtitles if s is not None] def download_subtitle(self, subtitle: WhisperAISubtitle): # Invoke Whisper through the API. This may take a long time depending on the file. # TODO: This loads the entire file into memory, find a good way to stream the file in chunks out = None if subtitle.task != "error": out = encode_audio_stream(subtitle.video.original_path, self.ffmpeg_path, subtitle.force_audio_stream) if out == None: logger.info(f"Whisper cannot process {subtitle.video.original_path} because of missing/bad audio track") subtitle.content = None return logger.debug(f'Audio stream length (in WAV format) is {len(out):,} bytes') if subtitle.task == "transcribe": output_language = subtitle.audio_language else: output_language = "eng" input_language = whisper_get_language_reverse(subtitle.audio_language) if input_language is None: if output_language == "eng": # guess that audio track is mislabelled English and let whisper try to transcribe it input_language = "en" subtitle.task = "transcribe" logger.info(f"Whisper treating unsupported audio track language: '{subtitle.audio_language}' as English") else: logger.info(f"Whisper cannot process {subtitle.video.original_path} because of unsupported audio track language: '{subtitle.audio_language}'") subtitle.content = None return logger.info(f'Starting WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} for {subtitle.video.original_path}') startTime = time.time() r = self.session.post(f"{self.endpoint}/asr", params={'task': subtitle.task, 'language': input_language, 'output': 'srt', 'encode': 'false'}, files={'audio_file': out}, timeout=(self.response, self.timeout)) endTime = time.time() elapsedTime = timedelta(seconds=round(endTime - startTime)) # for debugging, log if anything got returned subtitle_length = len(r.content) logger.debug(f'Returned subtitle length is {subtitle_length:,} bytes') subtitle_length = min(subtitle_length, 1000) if subtitle_length > 0: logger.debug(f'First {subtitle_length} bytes of subtitle: {r.content[0:subtitle_length]}') logger.info(f'Completed WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} in {elapsedTime} for {subtitle.video.original_path}') subtitle.content = r.content