Added OpenAI Whisper Provider

2 years ago · 8b1db07e9f
parent 1427a8ab73
commit 8b1db07e9f
4 changed files with 313 additions and 0 deletions
--- a/bazarr/app/config.py
+++ b/bazarr/app/config.py
@ -170,6 +170,10 @@ defaults = {
    'subf2m': {
        'verify_ssl': 'True'
    },
+    'whisperai': {
+        'endpoint': 'http://127.0.0.1:9000',
+        'timeout': '3600'
+    },
    'legendasdivx': {
        'username': '',
        'password': '',
--- a/bazarr/app/get_providers.py
+++ b/bazarr/app/get_providers.py
@ -252,6 +252,10 @@ def get_providers_auth():
        'subf2m': {
            'verify_ssl': settings.subf2m.getboolean('verify_ssl')
        },
+        'whisperai': {
+            'endpoint': settings.whisperai.endpoint,
+            'timeout': settings.whisperai.timeout
+        }
    }


--- a/frontend/src/pages/Settings/Providers/list.ts
+++ b/frontend/src/pages/Settings/Providers/list.ts
@ -206,6 +206,25 @@ export const ProviderList: Readonly<ProviderInfo[]> = [
    ],
  },
  { key: "napiprojekt", description: "Polish Subtitles Provider" },
+  {
+    key: "whisperai",
+    name: "Whisper",
+    description: "AI Generated Subtitles powered by Whisper",
+    inputs: [
+      {
+        type: "text",
+        key: "endpoint",
+        defaultValue: "http://127.0.0.1:9000",
+        name: "Whisper ASR Docker Endpoint",
+      },
+      {
+        type: "text",
+        key: "timeout",
+        defaultValue: 3600,
+        name: "Transcription/translation timeout in seconds",
+      },
+    ],
+  },
  {
    key: "napisy24",
    description: "Polish Subtitles Provider",
--- a/libs/subliminal_patch/providers/whisperai.py
+++ b/libs/subliminal_patch/providers/whisperai.py
@ -0,0 +1,286 @@
+from __future__ import absolute_import
+import logging
+
+from requests import Session
+
+from subliminal_patch.subtitle import Subtitle
+from subliminal_patch.providers import Provider
+from subliminal import __short_version__
+from subliminal.exceptions import ConfigurationError
+from subzero.language import Language
+from subliminal.video import Episode, Movie
+
+from babelfish.exceptions import LanguageReverseError
+
+import ffmpeg
+import functools
+
+# Language table used by this provider: Whisper language code -> lowercase
+# English language name. These are all the languages Whisper supports,
+# mirroring the table Whisper itself exposes:
+# from whisper.tokenizer import LANGUAGES
+#
+# Almost every key is a two-letter code, but "haw" is three letters, which is
+# why whisper_get_language() falls back to a lookup by name when the alpha2
+# lookup fails.
+
+whisper_languages = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+@functools.lru_cache(2)
+def encode_audio_stream(path, audio_stream_language=None):
+    logger.debug("Encoding audio stream to WAV with ffmpeg")
+
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        inp = ffmpeg.input(path, threads=0)
+        if audio_stream_language:
+            logger.debug(f"Whisper will only use the {audio_stream_language} audio stream for {path}")
+            inp = inp[f'a:m:language:{audio_stream_language}']
+
+        out, _ = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000) \
+                    .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    logger.debug(f"Finished encoding audio stream in {path} with no errors")
+
+    return out
+
+
+def whisper_get_language(code, name):
+    # Whisper uses an inconsistent mix of alpha2 and alpha3 language codes
+    try:
+        return Language.fromalpha2(code)
+    except LanguageReverseError:
+        return Language.fromname(name)
+
+
+def whisper_get_language_reverse(alpha3):
+    # Returns the whisper language code given an alpha3b language
+    for wl in whisper_languages:
+        lan = whisper_get_language(wl, whisper_languages[wl])
+        if lan.alpha3 == alpha3:
+            return wl
+    raise ValueError
+
+
+class WhisperAISubtitle(Subtitle):
+    '''Whisper AI Subtitle.'''
+    provider_name = 'whisperai'
+    hash_verifiable = False
+
+    def __init__(self, language, video):
+        super(WhisperAISubtitle, self).__init__(language)
+
+        self.video = video
+        self.task = None
+        self.audio_language = None
+        self.force_audio_stream = None
+
+    @property
+    def id(self):
+        return self.video.original_name
+
+    def get_matches(self, video):
+        matches = set()
+
+        if isinstance(video, Episode):
+            matches.update(["series", "season", "episode"])
+        elif isinstance(video, Movie):
+            matches.update(["title"])
+
+        return matches
+
+
+class WhisperAIProvider(Provider):
+    '''Whisper AI Provider.'''
+
+    languages = set()
+
+    for lan in whisper_languages:
+        languages.update({whisper_get_language(lan, whisper_languages[lan])})
+
+    languages.update(set(Language.rebuild(lang, hi=True) for lang in languages))
+    languages.update(set(Language.rebuild(lang, forced=True) for lang in languages))
+
+    video_types = (Episode, Movie)
+
+    def __init__(self, endpoint=None, timeout=None):
+        if not endpoint:
+            raise ConfigurationError('Whisper Web Service Endpoint must be provided')
+
+        if not timeout:
+            raise ConfigurationError('Whisper Web Service Timeout must be provided')
+
+        self.endpoint = endpoint
+        self.timeout = int(timeout)
+        self.session = None
+
+    def initialize(self):
+        self.session = Session()
+        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
+
+    def terminate(self):
+        self.session.close()
+
+
+    @functools.lru_cache(2048)
+    def detect_language(self, path) -> Language:
+        out = encode_audio_stream(path)
+
+        r = self.session.post(f"{self.endpoint}/detect-language",
+                              params={'encode': 'false'},
+                              files={'audio_file': out},
+                              timeout=self.timeout)
+
+        logger.info(f"Whisper detected language of {path} as {r.json()['detected_language']}")
+
+        return whisper_get_language(r.json()["language_code"], r.json()["detected_language"])
+
+    def query(self, language, video):
+        if language not in self.languages:
+            return None
+
+        sub = WhisperAISubtitle(language, video)
+        sub.task = "transcribe"
+
+        if video.audio_languages:
+            if language.alpha3 in video.audio_languages:
+                sub.audio_language = language.alpha3
+                if len(list(video.audio_languages)) > 1:
+                    sub.force_audio_stream = language.alpha3
+            else:
+                sub.task = "translate"
+                sub.audio_language = list(video.audio_languages)[0]
+        else:
+            # We must detect the language manually
+            detected_lang = self.detect_language(video.original_path)
+
+            if detected_lang != language:
+                sub.task = "translate"
+
+            sub.audio_language = detected_lang.alpha3
+
+        if sub.task == "translate":
+            if language.alpha3 != "eng":
+                logger.info(f"Translation only possible from {language} to English")
+                return None
+
+        logger.debug(f"Whisper ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]")
+
+        return sub
+
+    def list_subtitles(self, video, languages):
+        subtitles = [self.query(l, video) for l in languages]
+        return [s for s in subtitles if s is not None]
+
+    def download_subtitle(self, subtitle: WhisperAISubtitle):
+        # Invoke Whisper through the API. This may take a long time depending on the file.
+        # TODO: This loads the entire file into memory, find a good way to stream the file in chunks
+
+        out = encode_audio_stream(subtitle.video.original_path, subtitle.force_audio_stream)
+
+        r = self.session.post(f"{self.endpoint}/asr",
+                              params={'task': subtitle.task, 'language': whisper_get_language_reverse(subtitle.audio_language), 'output': 'srt', 'encode': 'false'},
+                              files={'audio_file': out},
+                              timeout=self.timeout)
+
+        subtitle.content = r.content