From 8b1db07e9f984f66df669d7b5a31868c9ef48530 Mon Sep 17 00:00:00 2001
From: Alex Yancey
Date: Wed, 5 Apr 2023 11:43:24 -0700
Subject: [PATCH] Added OpenAI Whisper Provider

---
 bazarr/app/config.py                          |   4 +
 bazarr/app/get_providers.py                   |   4 +
 frontend/src/pages/Settings/Providers/list.ts |  19 ++
 libs/subliminal_patch/providers/whisperai.py  | 319 ++++++++++++++++++
 4 files changed, 346 insertions(+)
 create mode 100644 libs/subliminal_patch/providers/whisperai.py

diff --git a/bazarr/app/config.py b/bazarr/app/config.py
index 476640db8..acf5e9104 100644
--- a/bazarr/app/config.py
+++ b/bazarr/app/config.py
@@ -170,6 +170,10 @@ defaults = {
     'subf2m': {
         'verify_ssl': 'True'
     },
+    'whisperai': {
+        'endpoint': 'http://127.0.0.1:9000',
+        'timeout': '3600'
+    },
     'legendasdivx': {
         'username': '',
         'password': '',
diff --git a/bazarr/app/get_providers.py b/bazarr/app/get_providers.py
index d73c109b6..2ea396a1d 100644
--- a/bazarr/app/get_providers.py
+++ b/bazarr/app/get_providers.py
@@ -252,6 +252,10 @@ def get_providers_auth():
         'subf2m': {
             'verify_ssl': settings.subf2m.getboolean('verify_ssl')
         },
+        'whisperai': {
+            'endpoint': settings.whisperai.endpoint,
+            'timeout': settings.whisperai.timeout
+        }
     }
 
 
diff --git a/frontend/src/pages/Settings/Providers/list.ts b/frontend/src/pages/Settings/Providers/list.ts
index 3ecc6097d..780258764 100644
--- a/frontend/src/pages/Settings/Providers/list.ts
+++ b/frontend/src/pages/Settings/Providers/list.ts
@@ -206,6 +206,25 @@ export const ProviderList: Readonly = [
     ],
   },
   { key: "napiprojekt", description: "Polish Subtitles Provider" },
+  {
+    key: "whisperai",
+    name: "Whisper",
+    description: "AI Generated Subtitles powered by Whisper",
+    inputs: [
+      {
+        type: "text",
+        key: "endpoint",
+        defaultValue: "http://127.0.0.1:9000",
+        name: "Whisper ASR Docker Endpoint",
+      },
+      {
+        type: "text",
+        key: "timeout",
+        defaultValue: 3600,
+        name: "Transcription/translation timeout in seconds",
+      },
+    ],
+  },
   {
     key: "napisy24",
     description: "Polish Subtitles Provider",
diff --git a/libs/subliminal_patch/providers/whisperai.py b/libs/subliminal_patch/providers/whisperai.py
new file mode 100644
index 000000000..6e7df7723
--- /dev/null
+++ b/libs/subliminal_patch/providers/whisperai.py
@@ -0,0 +1,319 @@
+from __future__ import absolute_import
+import logging
+
+from requests import Session
+
+from subliminal_patch.subtitle import Subtitle
+from subliminal_patch.providers import Provider
+from subliminal import __short_version__
+from subliminal.exceptions import ConfigurationError
+from subzero.language import Language
+from subliminal.video import Episode, Movie
+
+from babelfish.exceptions import LanguageReverseError
+
+import ffmpeg
+import functools
+
+# These are all the languages Whisper supports.
+# from whisper.tokenizer import LANGUAGES
+
+whisper_languages = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+logger = logging.getLogger(__name__)
+
+
+@functools.lru_cache(2)
+def encode_audio_stream(path, audio_stream_language=None):
+    """Decode the audio of ``path`` into raw mono 16 kHz 16-bit PCM bytes.
+
+    The two most recent results are memoized by ``lru_cache``, keyed on the
+    exact call arguments.
+
+    :param path: media file handed to ffmpeg.
+    :param audio_stream_language: when set, only the audio stream tagged with
+        this language is decoded.
+    :return: the bytes ffmpeg wrote to stdout.
+    :raises RuntimeError: if ffmpeg fails.
+    """
+    logger.debug("Encoding audio stream to WAV with ffmpeg")
+
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        inp = ffmpeg.input(path, threads=0)
+        if audio_stream_language:
+            logger.debug(f"Whisper will only use the {audio_stream_language} audio stream for {path}")
+            inp = inp[f'a:m:language:{audio_stream_language}']
+
+        out, _ = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000) \
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    logger.debug(f"Finished encoding audio stream in {path} with no errors")
+
+    return out
+
+
+def whisper_get_language(code, name):
+    """Return the ``Language`` for a Whisper language code, falling back to its name."""
+    # Whisper uses an inconsistent mix of alpha2 and alpha3 language codes
+    try:
+        return Language.fromalpha2(code)
+    except LanguageReverseError:
+        return Language.fromname(name)
+
+
+def whisper_get_language_reverse(alpha3):
+    """Return the Whisper language code whose ``Language`` has the given alpha3 code.
+
+    :raises ValueError: if no Whisper language maps to ``alpha3``.
+    """
+    for wl in whisper_languages:
+        lan = whisper_get_language(wl, whisper_languages[wl])
+        if lan.alpha3 == alpha3:
+            return wl
+    raise ValueError(f"Language {alpha3!r} is not supported by Whisper")
+
+
+class WhisperAISubtitle(Subtitle):
+    '''Whisper AI Subtitle.'''
+    provider_name = 'whisperai'
+    hash_verifiable = False
+
+    def __init__(self, language, video):
+        super(WhisperAISubtitle, self).__init__(language)
+
+        self.video = video
+        # Filled in by WhisperAIProvider.query()
+        self.task = None
+        self.audio_language = None
+        self.force_audio_stream = None
+
+    @property
+    def id(self):
+        return self.video.original_name
+
+    def get_matches(self, video):
+        matches = set()
+
+        if isinstance(video, Episode):
+            matches.update(["series", "season", "episode"])
+        elif isinstance(video, Movie):
+            matches.update(["title"])
+
+        return matches
+
+
+class WhisperAIProvider(Provider):
+    '''Whisper AI Provider.'''
+
+    languages = set()
+
+    for lan in whisper_languages:
+        languages.update({whisper_get_language(lan, whisper_languages[lan])})
+
+    languages.update(set(Language.rebuild(lang, hi=True) for lang in languages))
+    languages.update(set(Language.rebuild(lang, forced=True) for lang in languages))
+
+    video_types = (Episode, Movie)
+
+    def __init__(self, endpoint=None, timeout=None):
+        if not endpoint:
+            raise ConfigurationError('Whisper Web Service Endpoint must be provided')
+
+        if not timeout:
+            raise ConfigurationError('Whisper Web Service Timeout must be provided')
+
+        self.endpoint = endpoint
+        self.timeout = int(timeout)
+        self.session = None
+        # Per-instance cache of detected languages, keyed by file path.
+        self._detected_languages = {}
+
+    def initialize(self):
+        self.session = Session()
+        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
+
+    def terminate(self):
+        self.session.close()
+
+    def detect_language(self, path) -> Language:
+        """Ask the Whisper service which language is spoken in ``path``.
+
+        Results are cached on the instance rather than with ``lru_cache``, which
+        would keep every provider instance alive for the lifetime of the cache.
+
+        :raises requests.HTTPError: if the service answers with an error status.
+        """
+        if path in self._detected_languages:
+            return self._detected_languages[path]
+
+        # Pass None explicitly: lru_cache keys on the exact call arguments, so this can share
+        # the entry created by download_subtitle() when no audio stream is forced.
+        out = encode_audio_stream(path, None)
+
+        r = self.session.post(f"{self.endpoint}/detect-language",
+                              params={'encode': 'false'},
+                              files={'audio_file': out},
+                              timeout=self.timeout)
+        r.raise_for_status()
+        result = r.json()
+
+        logger.info(f"Whisper detected language of {path} as {result['detected_language']}")
+
+        detected = whisper_get_language(result["language_code"], result["detected_language"])
+        self._detected_languages[path] = detected
+        return detected
+
+    def query(self, language, video):
+        if language not in self.languages:
+            return None
+
+        sub = WhisperAISubtitle(language, video)
+        sub.task = "transcribe"
+
+        if video.audio_languages:
+            if language.alpha3 in video.audio_languages:
+                sub.audio_language = language.alpha3
+                if len(list(video.audio_languages)) > 1:
+                    sub.force_audio_stream = language.alpha3
+            else:
+                sub.task = "translate"
+                sub.audio_language = list(video.audio_languages)[0]
+        else:
+            # We must detect the language manually
+            detected_lang = self.detect_language(video.original_path)
+
+            if detected_lang != language:
+                sub.task = "translate"
+
+            sub.audio_language = detected_lang.alpha3
+
+        if sub.task == "translate":
+            if language.alpha3 != "eng":
+                logger.info(f"Translation only possible from {language} to English")
+                return None
+
+        logger.debug(f"Whisper ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]")
+
+        return sub
+
+    def list_subtitles(self, video, languages):
+        subtitles = [self.query(l, video) for l in languages]
+        return [s for s in subtitles if s is not None]
+
+    def download_subtitle(self, subtitle: WhisperAISubtitle):
+        # Invoke Whisper through the API. This may take a long time depending on the file.
+        # TODO: This loads the entire file into memory, find a good way to stream the file in chunks
+
+        out = encode_audio_stream(subtitle.video.original_path, subtitle.force_audio_stream)
+
+        r = self.session.post(f"{self.endpoint}/asr",
+                              params={'task': subtitle.task, 'language': whisper_get_language_reverse(subtitle.audio_language), 'output': 'srt', 'encode': 'false'},
+                              files={'audio_file': out},
+                              timeout=self.timeout)
+        # Do not store an HTTP error page as subtitle content.
+        r.raise_for_status()
+
+        subtitle.content = r.content