You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
bazarr/custom_libs/subliminal_patch/providers/whisperai.py

391 lines
13 KiB

from __future__ import absolute_import
import logging
import time
from datetime import timedelta
from requests import Session
from requests.exceptions import JSONDecodeError
from subliminal_patch.subtitle import Subtitle
from subliminal_patch.providers import Provider
from subliminal import __short_version__
from subliminal.exceptions import ConfigurationError
from subzero.language import Language
from subliminal.video import Episode, Movie
from babelfish.exceptions import LanguageReverseError
import ffmpeg
import functools
from pycountry import languages
# Languages Whisper supports, keyed by the language code Whisper uses and
# mapped to the lowercase English language name.
# Presumably a static copy of the table the line below would import - TODO confirm
# it is still in sync with the Whisper release in use:
# from whisper.tokenizer import LANGUAGES
# NOTE: the keys are not uniformly two-letter codes (e.g. "haw"), which is why
# lookups elsewhere in this module fall back to the language name.
whisper_languages = {
"en": "english",
"zh": "chinese",
"de": "german",
"es": "spanish",
"ru": "russian",
"ko": "korean",
"fr": "french",
"ja": "japanese",
"pt": "portuguese",
"tr": "turkish",
"pl": "polish",
"ca": "catalan",
"nl": "dutch",
"ar": "arabic",
"sv": "swedish",
"it": "italian",
"id": "indonesian",
"hi": "hindi",
"fi": "finnish",
"vi": "vietnamese",
"he": "hebrew",
"uk": "ukrainian",
"el": "greek",
"ms": "malay",
"cs": "czech",
"ro": "romanian",
"da": "danish",
"hu": "hungarian",
"ta": "tamil",
"no": "norwegian",
"th": "thai",
"ur": "urdu",
"hr": "croatian",
"bg": "bulgarian",
"lt": "lithuanian",
"la": "latin",
"mi": "maori",
"ml": "malayalam",
"cy": "welsh",
"sk": "slovak",
"te": "telugu",
"fa": "persian",
"lv": "latvian",
"bn": "bengali",
"sr": "serbian",
"az": "azerbaijani",
"sl": "slovenian",
"kn": "kannada",
"et": "estonian",
"mk": "macedonian",
"br": "breton",
"eu": "basque",
"is": "icelandic",
"hy": "armenian",
"ne": "nepali",
"mn": "mongolian",
"bs": "bosnian",
"kk": "kazakh",
"sq": "albanian",
"sw": "swahili",
"gl": "galician",
"mr": "marathi",
"pa": "punjabi",
"si": "sinhala",
"km": "khmer",
"sn": "shona",
"yo": "yoruba",
"so": "somali",
"af": "afrikaans",
"oc": "occitan",
"ka": "georgian",
"be": "belarusian",
"tg": "tajik",
"sd": "sindhi",
"gu": "gujarati",
"am": "amharic",
"yi": "yiddish",
"lo": "lao",
"uz": "uzbek",
"fo": "faroese",
"ht": "haitian creole",
"ps": "pashto",
"tk": "turkmen",
"nn": "nynorsk",
"mt": "maltese",
"sa": "sanskrit",
"lb": "luxembourgish",
"my": "myanmar",
"bo": "tibetan",
"tl": "tagalog",
"mg": "malagasy",
"as": "assamese",
"tt": "tatar",
"haw": "hawaiian",
"ln": "lingala",
"ha": "hausa",
"ba": "bashkir",
"jw": "javanese",
"su": "sundanese",
}
logger = logging.getLogger(__name__)


def set_log_level(newLevel="INFO"):
    '''Set the level of this module's logger from a level name.

    Args:
        newLevel: Case-insensitive logging level name such as "debug" or
            "INFO". ``None`` or an empty string selects the default, "INFO".

    Raises:
        AttributeError: If the upper-cased name is not an attribute of the
            ``logging`` module.
    '''
    # Callers may pass None when no level has been configured; treat that as
    # "use the default" instead of failing on None.upper().
    level_name = (newLevel or "INFO").upper()
    logger.setLevel(getattr(logging, level_name))


# Start out at the default level until a caller configures something else.
set_log_level()
# ffmpeg selects audio streams by the older ISO 639-2 language code, so handing
# it the newer ISO 639-3 code can fail to match a stream whose tag differs:
# e.g. German is tagged 'ger' rather than 'deu', French 'fre' rather than 'fra'.
def get_ISO_639_2_code(iso639_3_code):
    '''Return the ISO 639-2 code to give ffmpeg for an ISO 639-3 code.

    Looks the language up by its alpha-3 code and returns the entry's
    ``bibliographic`` code when it has one; otherwise the input is returned
    unchanged.
    '''
    entry = languages.get(alpha_3=iso639_3_code)
    if entry and hasattr(entry, 'bibliographic'):
        iso639_2_code = entry.bibliographic
    else:
        # Unknown language, or one without a separate bibliographic code
        iso639_2_code = iso639_3_code
    logger.debug(f"ffmpeg using language code '{iso639_2_code}' (instead of '{iso639_3_code}')")
    return iso639_2_code
@functools.lru_cache(2)
def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None):
    '''Decode a file's audio with ffmpeg and return it as raw PCM bytes.

    The audio is written to ffmpeg's stdout as mono, 16 kHz, signed 16-bit
    little-endian samples. Results for the two most recently used argument
    combinations are memoized by ``lru_cache``, including a ``None`` result.

    Args:
        path: Media file to read.
        ffmpeg_path: ffmpeg executable to run.
        audio_stream_language: Optional language code; when given, the audio
            stream tagged with that language is selected.

    Returns:
        The captured stdout bytes, or ``None`` if ffmpeg reported an error.
    '''
    logger.debug("Encoding audio stream to WAV with ffmpeg")
    try:
        # Runs ffmpeg as a subprocess, down-mixing and resampling as needed.
        source = ffmpeg.input(path, threads=0)
        if audio_stream_language:
            # Several audio streams exist: select the requested one by its
            # language tag, using the ISO 639-2 form of the code if available.
            audio_stream_language = get_ISO_639_2_code(audio_stream_language)
            logger.debug(f"Whisper will use the '{audio_stream_language}' audio stream for {path}")
            source = source[f'a:m:language:{audio_stream_language}']
        encoder = source.output(
            "-",
            format="s16le",
            acodec="pcm_s16le",
            ac=1,
            ar=16000,
            af="aresample=async=1",
        )
        pcm_bytes, _ = encoder.run(
            cmd=[ffmpeg_path, "-nostdin"],
            capture_stdout=True,
            capture_stderr=True,
        )
    except ffmpeg.Error as e:
        logger.warning(f"ffmpeg failed to load audio: {e.stderr.decode()}")
        return None
    else:
        logger.debug(f"Finished encoding audio stream in {path} with no errors")
        return pcm_bytes
def whisper_get_language(code, name):
    '''Build a Language from a Whisper language code, falling back to its name.

    Whisper mixes alpha2 and alpha3 codes, so ``code`` is tried as an alpha2
    code first and ``name`` is used only when that lookup is rejected.
    '''
    try:
        resolved = Language.fromalpha2(code)
    except LanguageReverseError:
        resolved = Language.fromname(name)
    return resolved
def whisper_get_language_reverse(alpha3):
    '''Return the Whisper language code whose Language has the given alpha3.

    Scans ``whisper_languages`` in order and returns the first matching key,
    or ``None`` when no supported language matches.
    '''
    return next(
        (
            code
            for code, name in whisper_languages.items()
            if whisper_get_language(code, name).alpha3 == alpha3
        ),
        None,
    )
def language_from_alpha3(lang):
    '''Return the ``name`` of the Language constructed from ``lang``.'''
    return Language(lang).name
class WhisperAISubtitle(Subtitle):
    '''Whisper AI Subtitle.'''
    provider_name = 'whisperai'
    hash_verifiable = False

    def __init__(self, language, video):
        '''Create a subtitle for ``video`` in ``language`` with no task assigned yet.'''
        super().__init__(language)
        self.video = video
        # The three fields below start empty and are set by the provider's query()
        self.task = None
        self.audio_language = None
        self.force_audio_stream = None

    @property
    def id(self):
        '''Identifier combining the video name, the task and the language.

        It has to be unique per language: otherwise the provider pool treats
        the subtitles as identical and keeps only the first one, which matters
        for language profiles containing more than one language.
        '''
        return f"{self.video.original_name}_{self.task}_{str(self.language)}"

    def get_matches(self, video):
        '''Return the match names claimed for ``video`` based only on its type.'''
        if isinstance(video, Episode):
            return {"series", "season", "episode"}
        if isinstance(video, Movie):
            return {"title"}
        return set()
class WhisperAIProvider(Provider):
    '''Whisper AI Provider.

    Produces subtitles by extracting a video's audio with ffmpeg and posting
    it to a Whisper web service, either transcribing it in the spoken language
    or translating it to English.
    '''

    # Every Whisper language, expressed as a Language object
    languages = {whisper_get_language(code, name) for code, name in whisper_languages.items()}

    video_types = (Episode, Movie)

    def __init__(self, endpoint=None, response=None, timeout=None, ffmpeg_path=None, loglevel=None):
        '''Validate and store the provider configuration.

        Args:
            endpoint: Base URL of the Whisper web service.
            response: Connection/response timeout, converted with ``int()``.
            timeout: Transcription/translation timeout, converted with ``int()``.
            ffmpeg_path: ffmpeg executable used to extract audio.
            loglevel: Logging level name; ``None`` or empty selects "INFO".

        Raises:
            ConfigurationError: If endpoint, response, timeout or ffmpeg_path
                is missing or falsy.
        '''
        # loglevel defaults to None, which is not a valid level name
        set_log_level(loglevel or "INFO")
        if not endpoint:
            raise ConfigurationError('Whisper Web Service Endpoint must be provided')

        if not response:
            raise ConfigurationError('Whisper Web Service Connection/response timeout must be provided')

        if not timeout:
            raise ConfigurationError('Whisper Web Service Transcription/translation timeout must be provided')

        if not ffmpeg_path:
            raise ConfigurationError("ffmpeg path must be provided")

        self.endpoint = endpoint.rstrip("/")
        self.response = int(response)
        self.timeout = int(timeout)
        self.session = None
        self.ffmpeg_path = ffmpeg_path

    def initialize(self):
        '''Open the HTTP session used for every request to the service.'''
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        '''Close the HTTP session, if one was opened.'''
        if self.session is not None:
            self.session.close()

    # NOTE(review): lru_cache on an instance method keys on ``self`` and keeps
    # the provider referenced while its entries live; it also memoizes a None
    # result. Left as-is to preserve the existing caching behavior.
    @functools.lru_cache(2048)
    def detect_language(self, path) -> Language:
        '''Ask the Whisper service which language is spoken in ``path``.

        Returns:
            The detected Language, or ``None`` when the audio cannot be
            extracted or the service response is empty or lacks the expected
            fields.
        '''
        out = encode_audio_stream(path, self.ffmpeg_path)

        if out is None:
            logger.info(f"Whisper cannot detect language of {path} because of missing/bad audio track")
            return None

        r = self.session.post(f"{self.endpoint}/detect-language",
                              params={'encode': 'false'},
                              files={'audio_file': out},
                              timeout=(self.response, self.timeout))

        try:
            results = r.json()
        except JSONDecodeError:
            results = {}

        if not results:
            logger.info("Whisper returned empty response when detecting language")
            return None

        # An error payload is valid JSON but lacks the language fields
        try:
            language_code = results["language_code"]
            detected_language = results["detected_language"]
        except (KeyError, TypeError):
            logger.info(f"Whisper returned unexpected response when detecting language of {path}: {results}")
            return None

        logger.debug(f"Whisper detected language of {path} as {detected_language}")

        return whisper_get_language(language_code, detected_language)

    def query(self, language, video):
        '''Plan the Whisper job needed to get ``language`` subtitles for ``video``.

        Returns:
            A WhisperAISubtitle whose ``task`` is "transcribe", "translate" or
            "error", or ``None`` when the language is unsupported or a
            translation to something other than English would be required.
        '''
        if language not in self.languages:
            return None

        sub = WhisperAISubtitle(language, video)
        sub.task = "transcribe"

        audio_languages = list(video.audio_languages) if video.audio_languages else []

        # A lone "und" (undetermined) tag carries no information
        if audio_languages and audio_languages != ["und"]:
            if language.alpha3 in audio_languages:
                sub.audio_language = language.alpha3
                if len(audio_languages) > 1:
                    # Several streams: pin the one in the requested language
                    sub.force_audio_stream = language.alpha3
            else:
                sub.task = "translate"

                eligible_languages = list(audio_languages)
                if len(eligible_languages) > 1:
                    if "und" in eligible_languages:
                        eligible_languages.remove("und")
                sub.audio_language = eligible_languages[0]
        else:
            # We must detect the language manually
            detected_lang = self.detect_language(video.original_path)
            if detected_lang is None:
                sub.task = "error"
                # tell the user what is wrong
                sub.release_info = "bad/missing audio track - cannot transcribe"
                return sub

            if detected_lang != language:
                sub.task = "translate"

            sub.audio_language = detected_lang.alpha3

        if sub.task == "translate":
            if language.alpha3 != "eng":
                logger.debug(f"Translation only possible from {language} to English")
                return None

        # tell the user what we are about to do
        sub.release_info = f"{sub.task} {language_from_alpha3(sub.audio_language)} audio -> {language_from_alpha3(language.alpha3)} SRT"
        logger.debug(f"Whisper query: ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]")

        return sub

    def list_subtitles(self, video, languages):
        '''Return the planned subtitles for every language that yields one.'''
        subtitles = [self.query(l, video) for l in languages]
        return [s for s in subtitles if s is not None]

    def download_subtitle(self, subtitle: WhisperAISubtitle):
        '''Run the planned Whisper task and store the SRT in ``subtitle.content``.

        ``subtitle.content`` is set to ``None`` when the audio cannot be
        extracted, when the audio language is unsupported and the output is
        not English, or when the service answers with an HTTP error status.
        '''
        # Invoke Whisper through the API. This may take a long time depending on the file.
        # TODO: This loads the entire file into memory, find a good way to stream the file in chunks

        out = None
        if subtitle.task != "error":
            out = encode_audio_stream(subtitle.video.original_path, self.ffmpeg_path, subtitle.force_audio_stream)
        if out is None:
            logger.info(f"Whisper cannot process {subtitle.video.original_path} because of missing/bad audio track")
            subtitle.content = None
            return

        logger.debug(f'Audio stream length (in WAV format) is {len(out):,} bytes')

        if subtitle.task == "transcribe":
            output_language = subtitle.audio_language
        else:
            output_language = "eng"

        input_language = whisper_get_language_reverse(subtitle.audio_language)
        if input_language is None:
            if output_language == "eng":
                # guess that audio track is mislabelled English and let whisper try to transcribe it
                input_language = "en"
                subtitle.task = "transcribe"
                logger.info(f"Whisper treating unsupported audio track language: '{subtitle.audio_language}' as English")
            else:
                logger.info(f"Whisper cannot process {subtitle.video.original_path} because of unsupported audio track language: '{subtitle.audio_language}'")
                subtitle.content = None
                return

        logger.info(f'Starting WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} for {subtitle.video.original_path}')
        # Monotonic clock: the elapsed time must not be skewed by wall-clock adjustments
        startTime = time.monotonic()

        r = self.session.post(f"{self.endpoint}/asr",
                              params={'task': subtitle.task, 'language': input_language, 'output': 'srt', 'encode': 'false'},
                              files={'audio_file': out},
                              timeout=(self.response, self.timeout))

        endTime = time.monotonic()
        elapsedTime = timedelta(seconds=round(endTime - startTime))

        # An error body is not a subtitle; do not store it as one
        if not r.ok:
            logger.warning(f'WhisperAI {subtitle.task} failed with HTTP status {r.status_code} after {elapsedTime} for {subtitle.video.original_path}')
            subtitle.content = None
            return

        # for debugging, log if anything got returned
        subtitle_length = len(r.content)
        logger.debug(f'Returned subtitle length is {subtitle_length:,} bytes')
        subtitle_length = min(subtitle_length, 1000)
        if subtitle_length > 0:
            logger.debug(f'First {subtitle_length} bytes of subtitle: {r.content[0:subtitle_length]}')

        logger.info(f'Completed WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} in {elapsedTime} for {subtitle.video.original_path}')

        subtitle.content = r.content