Added OpenAI Whisper Provider

pull/2110/head
Alex Yancey 2 years ago committed by GitHub
parent 1427a8ab73
commit 8b1db07e9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -170,6 +170,10 @@ defaults = {
'subf2m': { 'subf2m': {
'verify_ssl': 'True' 'verify_ssl': 'True'
}, },
'whisperai': {
'endpoint': 'http://127.0.0.1:9000',
'timeout': '3600'
},
'legendasdivx': { 'legendasdivx': {
'username': '', 'username': '',
'password': '', 'password': '',

@ -252,6 +252,10 @@ def get_providers_auth():
'subf2m': { 'subf2m': {
'verify_ssl': settings.subf2m.getboolean('verify_ssl') 'verify_ssl': settings.subf2m.getboolean('verify_ssl')
}, },
'whisperai': {
'endpoint': settings.whisperai.endpoint,
'timeout': settings.whisperai.timeout
}
} }

@ -206,6 +206,25 @@ export const ProviderList: Readonly<ProviderInfo[]> = [
], ],
}, },
{ key: "napiprojekt", description: "Polish Subtitles Provider" }, { key: "napiprojekt", description: "Polish Subtitles Provider" },
{
key: "whisperai",
name: "Whisper",
description: "AI Generated Subtitles powered by Whisper",
inputs: [
{
type: "text",
key: "endpoint",
defaultValue: "http://127.0.0.1:9000",
name: "Whisper ASR Docker Endpoint",
},
{
type: "text",
key: "timeout",
defaultValue: 3600,
name: "Transcription/translation timeout in seconds",
},
],
},
{ {
key: "napisy24", key: "napisy24",
description: "Polish Subtitles Provider", description: "Polish Subtitles Provider",

@ -0,0 +1,286 @@
from __future__ import absolute_import
import logging
from requests import Session
from subliminal_patch.subtitle import Subtitle
from subliminal_patch.providers import Provider
from subliminal import __short_version__
from subliminal.exceptions import ConfigurationError
from subzero.language import Language
from subliminal.video import Episode, Movie
from babelfish.exceptions import LanguageReverseError
import ffmpeg
import functools
# These are all the languages Whisper supports.
# from whisper.tokenizer import LANGUAGES
whisper_languages = {
"en": "english",
"zh": "chinese",
"de": "german",
"es": "spanish",
"ru": "russian",
"ko": "korean",
"fr": "french",
"ja": "japanese",
"pt": "portuguese",
"tr": "turkish",
"pl": "polish",
"ca": "catalan",
"nl": "dutch",
"ar": "arabic",
"sv": "swedish",
"it": "italian",
"id": "indonesian",
"hi": "hindi",
"fi": "finnish",
"vi": "vietnamese",
"he": "hebrew",
"uk": "ukrainian",
"el": "greek",
"ms": "malay",
"cs": "czech",
"ro": "romanian",
"da": "danish",
"hu": "hungarian",
"ta": "tamil",
"no": "norwegian",
"th": "thai",
"ur": "urdu",
"hr": "croatian",
"bg": "bulgarian",
"lt": "lithuanian",
"la": "latin",
"mi": "maori",
"ml": "malayalam",
"cy": "welsh",
"sk": "slovak",
"te": "telugu",
"fa": "persian",
"lv": "latvian",
"bn": "bengali",
"sr": "serbian",
"az": "azerbaijani",
"sl": "slovenian",
"kn": "kannada",
"et": "estonian",
"mk": "macedonian",
"br": "breton",
"eu": "basque",
"is": "icelandic",
"hy": "armenian",
"ne": "nepali",
"mn": "mongolian",
"bs": "bosnian",
"kk": "kazakh",
"sq": "albanian",
"sw": "swahili",
"gl": "galician",
"mr": "marathi",
"pa": "punjabi",
"si": "sinhala",
"km": "khmer",
"sn": "shona",
"yo": "yoruba",
"so": "somali",
"af": "afrikaans",
"oc": "occitan",
"ka": "georgian",
"be": "belarusian",
"tg": "tajik",
"sd": "sindhi",
"gu": "gujarati",
"am": "amharic",
"yi": "yiddish",
"lo": "lao",
"uz": "uzbek",
"fo": "faroese",
"ht": "haitian creole",
"ps": "pashto",
"tk": "turkmen",
"nn": "nynorsk",
"mt": "maltese",
"sa": "sanskrit",
"lb": "luxembourgish",
"my": "myanmar",
"bo": "tibetan",
"tl": "tagalog",
"mg": "malagasy",
"as": "assamese",
"tt": "tatar",
"haw": "hawaiian",
"ln": "lingala",
"ha": "hausa",
"ba": "bashkir",
"jw": "javanese",
"su": "sundanese",
}
logger = logging.getLogger(__name__)
@functools.lru_cache(2)
def encode_audio_stream(path, audio_stream_language=None):
logger.debug("Encoding audio stream to WAV with ffmpeg")
try:
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
inp = ffmpeg.input(path, threads=0)
if audio_stream_language:
logger.debug(f"Whisper will only use the {audio_stream_language} audio stream for {path}")
inp = inp[f'a:m:language:{audio_stream_language}']
out, _ = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000) \
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
logger.debug(f"Finished encoding audio stream in {path} with no errors")
return out
def whisper_get_language(code, name):
# Whisper uses an inconsistent mix of alpha2 and alpha3 language codes
try:
return Language.fromalpha2(code)
except LanguageReverseError:
return Language.fromname(name)
def whisper_get_language_reverse(alpha3):
# Returns the whisper language code given an alpha3b language
for wl in whisper_languages:
lan = whisper_get_language(wl, whisper_languages[wl])
if lan.alpha3 == alpha3:
return wl
raise ValueError
class WhisperAISubtitle(Subtitle):
'''Whisper AI Subtitle.'''
provider_name = 'whisperai'
hash_verifiable = False
def __init__(self, language, video):
super(WhisperAISubtitle, self).__init__(language)
self.video = video
self.task = None
self.audio_language = None
self.force_audio_stream = None
@property
def id(self):
return self.video.original_name
def get_matches(self, video):
matches = set()
if isinstance(video, Episode):
matches.update(["series", "season", "episode"])
elif isinstance(video, Movie):
matches.update(["title"])
return matches
class WhisperAIProvider(Provider):
'''Whisper AI Provider.'''
languages = set()
for lan in whisper_languages:
languages.update({whisper_get_language(lan, whisper_languages[lan])})
languages.update(set(Language.rebuild(lang, hi=True) for lang in languages))
languages.update(set(Language.rebuild(lang, forced=True) for lang in languages))
video_types = (Episode, Movie)
def __init__(self, endpoint=None, timeout=None):
if not endpoint:
raise ConfigurationError('Whisper Web Service Endpoint must be provided')
if not timeout:
raise ConfigurationError('Whisper Web Service Timeout must be provided')
self.endpoint = endpoint
self.timeout = int(timeout)
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
def terminate(self):
self.session.close()
@functools.lru_cache(2048)
def detect_language(self, path) -> Language:
out = encode_audio_stream(path)
r = self.session.post(f"{self.endpoint}/detect-language",
params={'encode': 'false'},
files={'audio_file': out},
timeout=self.timeout)
logger.info(f"Whisper detected language of {path} as {r.json()['detected_language']}")
return whisper_get_language(r.json()["language_code"], r.json()["detected_language"])
def query(self, language, video):
if language not in self.languages:
return None
sub = WhisperAISubtitle(language, video)
sub.task = "transcribe"
if video.audio_languages:
if language.alpha3 in video.audio_languages:
sub.audio_language = language.alpha3
if len(list(video.audio_languages)) > 1:
sub.force_audio_stream = language.alpha3
else:
sub.task = "translate"
sub.audio_language = list(video.audio_languages)[0]
else:
# We must detect the language manually
detected_lang = self.detect_language(video.original_path)
if detected_lang != language:
sub.task = "translate"
sub.audio_language = detected_lang.alpha3
if sub.task == "translate":
if language.alpha3 != "eng":
logger.info(f"Translation only possible from {language} to English")
return None
logger.debug(f"Whisper ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]")
return sub
def list_subtitles(self, video, languages):
subtitles = [self.query(l, video) for l in languages]
return [s for s in subtitles if s is not None]
def download_subtitle(self, subtitle: WhisperAISubtitle):
# Invoke Whisper through the API. This may take a long time depending on the file.
# TODO: This loads the entire file into memory, find a good way to stream the file in chunks
out = encode_audio_stream(subtitle.video.original_path, subtitle.force_audio_stream)
r = self.session.post(f"{self.endpoint}/asr",
params={'task': subtitle.task, 'language': whisper_get_language_reverse(subtitle.audio_language), 'output': 'srt', 'encode': 'false'},
files={'audio_file': out},
timeout=self.timeout)
subtitle.content = r.content
Loading…
Cancel
Save