|
|
|
from __future__ import absolute_import
|
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
from datetime import timedelta
|
|
|
|
|
|
|
|
from requests import Session
|
|
|
|
|
|
|
|
from requests.exceptions import JSONDecodeError
|
|
|
|
from subliminal_patch.subtitle import Subtitle
|
|
|
|
from subliminal_patch.providers import Provider
|
|
|
|
from subliminal import __short_version__
|
|
|
|
from subliminal.exceptions import ConfigurationError
|
|
|
|
from subzero.language import Language
|
|
|
|
from subliminal.video import Episode, Movie
|
|
|
|
|
|
|
|
from babelfish.exceptions import LanguageReverseError
|
|
|
|
|
|
|
|
import ffmpeg
|
|
|
|
import functools
|
|
|
|
from pycountry import languages
|
|
|
|
|
|
|
|
# These are all the languages Whisper supports.
|
|
|
|
# from whisper.tokenizer import LANGUAGES
|
|
|
|
|
|
|
|
# Maps the language code Whisper expects to Whisper's own English name for it.
# Most keys are two-letter codes; a few (e.g. "haw", "jw") are not, which is why
# lookups elsewhere in this module fall back to the name.
# Insertion order is preserved on purpose: reverse lookups return the first match.
whisper_languages = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)


def set_log_level(newLevel="INFO"):
    """Set the level of this module's logger from a level name.

    Args:
        newLevel: Case-insensitive name of a ``logging`` level such as
            ``"debug"`` or ``"INFO"``. ``None`` or an empty string selects
            the default, ``"INFO"``.

    Raises:
        AttributeError: If the name is not an attribute of the ``logging``
            module.
    """
    # Callers may forward an unset configuration value (None or ""); treat it
    # as "use the default" instead of failing on None.upper().
    if not newLevel:
        newLevel = "INFO"
    newLevel = newLevel.upper()
    # print(f'WhisperAI log level changing from {logging._levelToName[logger.getEffectiveLevel()]} to {newLevel}')
    logger.setLevel(getattr(logging, newLevel))


# initialize to default above
set_log_level()
|
|
|
|
|
|
|
|
# ffmpeg uses the older ISO 639-2 code when extracting audio streams based on language
|
|
|
|
# if we give it the newer ISO 639-3 code it can't find that audio stream by name because it's different
|
|
|
|
# for example it wants 'ger' instead of 'deu' for the German language
|
|
|
|
# or 'fre' instead of 'fra' for the French language
|
|
|
|
def get_ISO_639_2_code(iso639_3_code):
    """Return the ISO 639-2 bibliographic code for an ISO 639-3 code.

    The code is looked up through pycountry's ``languages`` database. When
    no record is found, or the record has no ``bibliographic`` attribute,
    the input is returned unchanged.
    """
    # look the language up by its ISO 639-3 code
    record = languages.get(alpha_3=iso639_3_code)

    # prefer the bibliographic (ISO 639-2) code; otherwise keep the input as is
    if record and hasattr(record, 'bibliographic'):
        iso639_2_code = record.bibliographic
    else:
        iso639_2_code = iso639_3_code

    logger.debug(f"ffmpeg using language code '{iso639_2_code}' (instead of '{iso639_3_code}')")
    return iso639_2_code
|
|
|
|
|
|
|
|
@functools.lru_cache(2)
def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None):
    """Decode a media file's audio with ffmpeg and return it as raw PCM bytes.

    The audio is written to ffmpeg's stdout as signed 16-bit little-endian
    PCM, down-mixed to one channel and resampled to 16000 Hz.

    Args:
        path: Path of the media file to read.
        ffmpeg_path: Path of the ffmpeg executable to run.
        audio_stream_language: Optional language code of the audio stream to
            select. It is converted with ``get_ISO_639_2_code`` before being
            used in the ffmpeg stream specifier. When falsy, no stream is
            selected explicitly.

    Returns:
        The bytes captured from ffmpeg's stdout, or ``None`` if ffmpeg
        reported an error.

    Note:
        Results are memoised by ``functools.lru_cache`` (two entries), keyed
        on the arguments; a ``None`` result is cached like any other.
    """
    logger.debug("Encoding audio stream to WAV with ffmpeg")

    try:
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        inp = ffmpeg.input(path, threads=0)
        if audio_stream_language:
            # There is more than one audio stream, so pick the requested one by name
            # Use the ISO 639-2 code if available
            audio_stream_language = get_ISO_639_2_code(audio_stream_language)
            logger.debug(f"Whisper will use the '{audio_stream_language}' audio stream for {path}")
            inp = inp[f'a:m:language:{audio_stream_language}']

        stream = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af="aresample=async=1")
        out, _ = stream.run(cmd=[ffmpeg_path, "-nostdin"], capture_stdout=True, capture_stderr=True)

    except ffmpeg.Error as e:
        # ffmpeg's stderr can contain bytes that are not valid UTF-8 (e.g. file
        # names or metadata); never let the log message itself raise here.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        logger.warning(f"ffmpeg failed to load audio: {stderr_text}")
        return None

    logger.debug(f"Finished encoding audio stream in {path} with no errors")

    return out
|
|
|
|
|
|
|
|
|
|
|
|
def whisper_get_language(code, name):
    """Build a ``Language`` from a Whisper language code, falling back to its name.

    Whisper uses an inconsistent mix of alpha2 and alpha3 language codes, so
    the code is tried as alpha2 first; if that lookup raises
    ``LanguageReverseError`` the language is resolved from ``name`` instead.
    """
    try:
        resolved = Language.fromalpha2(code)
    except LanguageReverseError:
        resolved = Language.fromname(name)
    return resolved
|
|
|
|
|
|
|
|
|
|
|
|
def whisper_get_language_reverse(alpha3):
    """Return the Whisper language code whose language has the given alpha3 code.

    ``whisper_languages`` is scanned in insertion order and the first key
    whose resolved ``Language.alpha3`` equals ``alpha3`` is returned.
    Returns ``None`` when no entry matches.
    """
    candidates = (
        whisper_code
        for whisper_code, whisper_name in whisper_languages.items()
        if whisper_get_language(whisper_code, whisper_name).alpha3 == alpha3
    )
    # the generator is lazy, so scanning stops at the first hit
    return next(candidates, None)
|
|
|
|
|
|
|
|
def language_from_alpha3(lang):
    """Return the ``name`` of the ``Language`` constructed from ``lang``."""
    return Language(lang).name
|
|
|
|
|
|
|
|
class WhisperAISubtitle(Subtitle):
    """Whisper AI Subtitle.

    Describes a subtitle that Whisper would generate for ``video``. The
    ``task``, ``audio_language`` and ``force_audio_stream`` attributes start
    as ``None`` and are filled in by the provider.
    """

    provider_name = 'whisperai'
    hash_verifiable = False

    def __init__(self, language, video):
        super().__init__(language)

        # the video the subtitle is generated for
        self.video = video
        # set later by the provider
        self.task = None
        self.audio_language = None
        self.force_audio_stream = None

    @property
    def id(self):
        """Identifier of the subtitle: the video's ``original_name``."""
        return self.video.original_name

    def get_matches(self, video):
        """Return the set of match names claimed for ``video``.

        Episodes get series/season/episode, movies get title, and any other
        video type gets an empty set.
        """
        if isinstance(video, Episode):
            return {"series", "season", "episode"}
        if isinstance(video, Movie):
            return {"title"}
        return set()
|
|
|
|
|
|
|
|
|
|
|
|
class WhisperAIProvider(Provider):
    """Whisper AI Provider.

    Generates subtitles by sending a video's audio, decoded with ffmpeg, to a
    Whisper web service reachable at ``endpoint``.
    """

    # Every language listed in whisper_languages, resolved to a Language object.
    languages = {whisper_get_language(code, name) for code, name in whisper_languages.items()}

    video_types = (Episode, Movie)

    def __init__(self, endpoint=None, response=None, timeout=None, ffmpeg_path=None, loglevel=None):
        """Validate and store the provider configuration.

        Args:
            endpoint: Base URL of the Whisper web service; trailing slashes
                are stripped.
            response: Connection/response timeout, converted with ``int``.
            timeout: Transcription/translation timeout, converted with ``int``.
            ffmpeg_path: Path of the ffmpeg executable.
            loglevel: Optional log level name for this module's logger;
                ``"INFO"`` is used when it is not supplied.

        Raises:
            ConfigurationError: If endpoint, response, timeout or ffmpeg_path
                is missing.
        """
        # loglevel is optional; use INFO when it is not supplied
        set_log_level(loglevel or "INFO")
        if not endpoint:
            raise ConfigurationError('Whisper Web Service Endpoint must be provided')

        if not response:
            raise ConfigurationError('Whisper Web Service Connection/response timeout must be provided')

        if not timeout:
            raise ConfigurationError('Whisper Web Service Transcription/translation timeout must be provided')

        if not ffmpeg_path:
            raise ConfigurationError("ffmpeg path must be provided")

        self.endpoint = endpoint.rstrip("/")
        self.response = int(response)
        self.timeout = int(timeout)
        self.session = None
        self.ffmpeg_path = ffmpeg_path

    def initialize(self):
        """Create the HTTP session used for all requests to the service."""
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        """Close the HTTP session, if one was created."""
        # session stays None until initialize() runs
        if self.session is not None:
            self.session.close()

    # NOTE(review): lru_cache on a method keys on (self, path) and keeps the
    # instance referenced while its entries are cached; None results are
    # cached too. Left as is to preserve the existing caching behaviour.
    @functools.lru_cache(2048)
    def detect_language(self, path) -> Language:
        """Ask the Whisper service which language is spoken in ``path``.

        Returns:
            The detected ``Language``, or ``None`` when the audio could not be
            encoded or the service reply is empty or lacks the expected fields.
        """
        out = encode_audio_stream(path, self.ffmpeg_path)

        if out is None:
            logger.info(f"Whisper cannot detect language of {path} because of missing/bad audio track")
            return None

        r = self.session.post(f"{self.endpoint}/detect-language",
                              params={'encode': 'false'},
                              files={'audio_file': out},
                              timeout=(self.response, self.timeout))

        try:
            results = r.json()
        except JSONDecodeError:
            results = {}

        if not results:
            logger.info("Whisper returned empty response when detecting language")
            return None

        # An error reply can be valid JSON without the fields we need.
        try:
            language_code = results["language_code"]
            detected_language = results["detected_language"]
        except (KeyError, TypeError):
            logger.info(f"Whisper returned an unexpected response when detecting language of {path}: {results!r}")
            return None

        logger.debug(f"Whisper detected language of {path} as {detected_language}")

        return whisper_get_language(language_code, detected_language)

    def query(self, language, video):
        """Describe the subtitle Whisper would produce for ``video`` in ``language``.

        The task is "transcribe" when the audio is already in ``language`` and
        "translate" otherwise; translation is only offered into English. When
        the video has no usable audio language tags the language is detected
        through the service.

        Returns:
            A ``WhisperAISubtitle``, or ``None`` when the language is not
            supported or would need translation into something other than
            English. If detection fails, the subtitle is returned with task
            "error" and an explanatory ``release_info``.
        """
        if language not in self.languages:
            return None

        sub = WhisperAISubtitle(language, video)
        sub.task = "transcribe"

        if video.audio_languages and not (list(video.audio_languages)[0] == "und" and len(video.audio_languages) == 1):
            if language.alpha3 in video.audio_languages:
                sub.audio_language = language.alpha3
                if len(list(video.audio_languages)) > 1:
                    # several streams: make ffmpeg pick the matching one
                    sub.force_audio_stream = language.alpha3
            else:
                sub.task = "translate"

                eligible_languages = list(video.audio_languages)
                if len(eligible_languages) > 1:
                    if "und" in eligible_languages:
                        eligible_languages.remove("und")
                sub.audio_language = eligible_languages[0]
        else:
            # We must detect the language manually
            detected_lang = self.detect_language(video.original_path)
            if detected_lang is None:
                sub.task = "error"
                # tell the user what is wrong
                sub.release_info = "bad/missing audio track - cannot transcribe"
                return sub

            if detected_lang != language:
                sub.task = "translate"

            sub.audio_language = detected_lang.alpha3

        if sub.task == "translate":
            if language.alpha3 != "eng":
                logger.debug(f"Translation only possible from {language} to English")
                return None

        # tell the user what we are about to do
        sub.release_info = f"{sub.task} {language_from_alpha3(sub.audio_language)} audio -> {language_from_alpha3(language.alpha3)} SRT"
        logger.debug(f"Whisper query: ({video.original_path}): {sub.audio_language} -> {language.alpha3} [TASK: {sub.task}]")

        return sub

    def list_subtitles(self, video, languages):
        """Return one subtitle per requested language that ``query`` accepts."""
        subtitles = [self.query(lang, video) for lang in languages]
        return [s for s in subtitles if s is not None]

    def download_subtitle(self, subtitle: WhisperAISubtitle):
        """Run the Whisper task for ``subtitle`` and store the result in ``subtitle.content``.

        ``subtitle.content`` is set to ``None`` when the audio cannot be
        encoded, when the audio language is unsupported and the output is not
        English, or when the service answers with an HTTP error status.
        """
        # Invoke Whisper through the API. This may take a long time depending on the file.
        # TODO: This loads the entire file into memory, find a good way to stream the file in chunks

        out = None
        if subtitle.task != "error":
            out = encode_audio_stream(subtitle.video.original_path, self.ffmpeg_path, subtitle.force_audio_stream)
        if out is None:
            logger.info(f"Whisper cannot process {subtitle.video.original_path} because of missing/bad audio track")
            subtitle.content = None
            return

        logger.debug(f'Audio stream length (in WAV format) is {len(out):,} bytes')

        if subtitle.task == "transcribe":
            output_language = subtitle.audio_language
        else:
            output_language = "eng"

        input_language = whisper_get_language_reverse(subtitle.audio_language)
        if input_language is None:
            if output_language == "eng":
                # guess that audio track is mislabelled English and let whisper try to transcribe it
                input_language = "en"
                subtitle.task = "transcribe"
                logger.info(f"Whisper treating unsupported audio track language: '{subtitle.audio_language}' as English")
            else:
                logger.info(f"Whisper cannot process {subtitle.video.original_path} because of unsupported audio track language: '{subtitle.audio_language}'")
                subtitle.content = None
                return

        logger.info(f'Starting WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} for {subtitle.video.original_path}')
        startTime = time.time()

        r = self.session.post(f"{self.endpoint}/asr",
                              params={'task': subtitle.task, 'language': input_language, 'output': 'srt', 'encode': 'false'},
                              files={'audio_file': out},
                              timeout=(self.response, self.timeout))

        endTime = time.time()
        elapsedTime = timedelta(seconds=round(endTime - startTime))

        # for debugging, log if anything got returned
        subtitle_length = len(r.content)
        logger.debug(f'Returned subtitle length is {subtitle_length:,} bytes')
        subtitle_length = min(subtitle_length, 1000)
        if subtitle_length > 0:
            logger.debug(f'First {subtitle_length} bytes of subtitle: {r.content[0:subtitle_length]}')

        # An HTTP error body is not a subtitle; do not store it as one.
        if not r.ok:
            logger.warning(f'WhisperAI {subtitle.task} failed with HTTP status {r.status_code} for {subtitle.video.original_path}')
            subtitle.content = None
            return

        logger.info(f'Completed WhisperAI {subtitle.task} to {language_from_alpha3(output_language)} in {elapsedTime} for {subtitle.video.original_path}')

        subtitle.content = r.content
|