diff --git a/libs/subliminal_patch/providers/regielive.py b/libs/subliminal_patch/providers/regielive.py index 2d97d7adb..d20972f03 100644 --- a/libs/subliminal_patch/providers/regielive.py +++ b/libs/subliminal_patch/providers/regielive.py @@ -4,29 +4,18 @@ import logging import io import os -#imports needed for the searchAPI -import re -import enum -import numpy as np -from time import sleep -from hashlib import sha1 -from subliminal.cache import region -from urllib import parse as urlparse -#end imports for searchAPI - -import zipfile - +from requests import Session from guessit import guessit -from subzero.language import Language from subliminal_patch.providers import Provider from subliminal_patch.subtitle import Subtitle, guess_matches -from subliminal.video import Episode, Movie from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending -from requests import Session, exceptions as req_exceptions +from subliminal.video import Episode, Movie +from subzero.language import Language +import urllib +import zipfile logger = logging.getLogger(__name__) -BASE_URL = "https://subtitrari.regielive.ro" class RegieLiveSubtitle(Subtitle): """RegieLive Subtitle.""" @@ -49,7 +38,7 @@ class RegieLiveSubtitle(Subtitle): def get_matches(self, video): type_ = "movie" if isinstance(video, Movie) else "episode" matches = set() - subtitle_filename = self.filename + subtitle_filename = self.filename.lower() # episode if type_ == "episode": @@ -60,9 +49,8 @@ class RegieLiveSubtitle(Subtitle): # already matched in search query matches.update(['title', 'year']) - # release_group if video.release_group and video.release_group.lower() in subtitle_filename: - matches.add('release_group') + matches.update(['release_group', 'hash']) matches |= guess_matches(video, guessit(self.filename, {"type": type_})) @@ -74,59 +62,66 @@ class RegieLiveProvider(Provider): languages = {Language(l) for l in ['ron']} language = list(languages)[0] video_types = (Episode, Movie) - download_cookies = None SEARCH_THROTTLE = 8 + hash_verifiable = False def __init__(self): self.initialize() def initialize(self): self.session = Session() - self.url = BASE_URL - self.api = 'API-KODI-KINGUL' + self.url = 'https://api.regielive.ro/bazarr/search.php' + self.api = 'API-BAZARR-YTZ-SL' self.headers = {'RL-API': self.api} def terminate(self): self.session.close() def query(self, video, language): - search_api = RegieLiveSearchAPI(video) - results = search_api.search_video() - subtitles = [] - if results: - for result in results: - logger.debug(result) - subtitles.append( - RegieLiveSubtitle(result.title, video, result.download_url, result.rating, language) - ) + payload = {} + if isinstance(video, Episode): + payload['nume'] = video.series + payload['sezon'] = video.season + payload['episod'] = video.episode + elif isinstance(video, Movie): + payload['nume'] = video.title + payload['an'] = video.year + + response = self.session.get( + self.url + "?" + urllib.parse.urlencode(payload), + data=payload, headers=self.headers) - # {'titlu': 'Chernobyl.S01E04.The.Happiness.of.All.Mankind.720p.AMZN.WEB-DL.DDP5.1.H.264-NTb', 'url': 'https://subtitrari.regielive.ro/descarca-33336-418567.zip', 'rating': {'nota': 4.89, 'voturi': 48}} - # subtitle def __init__(self, language, filename, subtype, video, link): - self.download_cookies = search_api.get_req_cookies() + subtitles = [] + if response.json()['cod'] == 200: + results_subs = response.json()['rezultate'] + for film in results_subs: + for sub in results_subs[film]['subtitrari']: + subtitles.append( + RegieLiveSubtitle( + results_subs[film]['subtitrari'][sub]['titlu'], + video, + results_subs[film]['subtitrari'][sub]['url'], + results_subs[film]['subtitrari'][sub]['rating']['nota'], + language)) return subtitles def list_subtitles(self, video, languages): return self.query(video, self.language) def download_subtitle(self, subtitle): - session = Session() + session = self.session _addheaders = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Origin': BASE_URL, - 'Accept-Language' : 'en-US,en;q=0.5', - 'Referer': BASE_URL, + 'Origin': 'https://subtitrari.regielive.ro', + 'Accept-Language': 'en-US,en;q=0.5', + 'Referer': 'https://subtitrari.regielive.ro', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache' } - session.headers.update(_addheaders) - if self.download_cookies is None: #try and get the needed cookies through a request if no cookies exist from the API - res = session.get(BASE_URL) - cookies = res.cookies - else: - cookies = self.download_cookies - + res = session.get('https://subtitrari.regielive.ro') + cookies = res.cookies _zipped = session.get(subtitle.page_link, cookies=cookies) if _zipped: if _zipped.text == '500': @@ -156,401 +151,3 @@ class RegieLiveProvider(Provider): return archive.read(name) raise APIThrottled('Can not find the subtitle in the compressed file') - -""" -# Search helper for Regielive that uses scraping to find subtitles -# This utility API should return a list of RegieLiveAPIData objects when queried -# by using a mix of json api search and page scraping in order to fetch data -# from Regielive website. -# -# This may break at anytime since regex is very sensitive to website structure changes -# for this in the future I might make the regex to load directly from github -""" - -#data classes -class RegieLiveAPIData(): - 'data returned class' - title = '' - rating = None - download_url = '' - - def __init__(self, title, url, rating): - self.title = title - self.download_url = url - self.rating = rating - - def __repr__(self): - return "" - - -class RegieLiveAPIRating(): # probably an extraneous class - 'rating for the subtitle' - rating = 0 - count = 0 - - def __init__(self, rating, count): - if rating: - self.rating = rating - - if not count: - self.count = 0 - if count and isinstance(count, str) and count.isnumeric(): - self.count = count - elif count == 'vot': - self.count = 1 - else: - self.count = 0 - - -# constants -CACHE_PREFIX = 'RL_API' - -DEFAULT_HEADERS = { - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \ - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - 'Origin': 'https://subtitrari.regielive.ro', - 'Accept-Language': 'en-US,en;q=0.5', - 'Referer': 'https://subtitrari.regielive.ro', - 'Pragma': 'no-cache', - 'Cache-Control': 'no-cache' -} - -REQUEST_TIMEOUT = 15 - -LITE_JSON_PATH = "/ajax/subtitrari/searchsuggest.php" -PAGE_SEARCH_PATH = "/cauta.html" -SEASON_URL = "sezonul-%i/" - -SUB_PAGE_EPISODE_PATTERN = r'(?ism)

Episodul %s

(.+?)' -SUB_PAGE_MOVIE_MATCH = re.compile( - r'(?ism)
.*?') - -SUB_FILE_INFO_MATCH = re.compile( - r'(?ism)id="sub_\d+">([^<]+).*?Nota ([0-9.]+)\s+(?:dintr-un\s+?(\w+)|din\s+?([0-9]+)\s*?)[^>].*?') -SEARCH_PAGE_MATCH = re.compile( - r'(?ism)class="detalii\s[^>]{1}.+?]+?>([^<]+)\s*\((\d{4})\)') - -# helpers -def title_match(s, t, ratio_calc=False): - """ title_match: - Tries to calculate the levenshtein distance between two strings. - If ratio_calc = True, the function computes the - levenshtein distance ratio of similarity between two strings - This function is mainly copied from the Levenshtein package - """ - # Initialize matrix of zeros - rows = len(s)+1 - cols = len(t)+1 - distance = np.zeros((rows, cols), dtype=int) - - for i in range(1, rows): - for k in range(1, cols): - distance[i][0] = i - distance[0][k] = k - - # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions - for col in range(1, cols): - for row in range(1, rows): - if s[row-1] == t[col-1]: - cost = 0 - else: - # the cost of a substitution is 2 for distance the cost of a substitution is 1. - if ratio_calc: - cost = 2 - else: - cost = 1 - distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions - # Cost of insertions - distance[row][col-1] + 1, - distance[row-1][col-1] + cost) # Cost of substitutions - if ratio_calc: - ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t)) - return ratio - else: - # This is the minimum number of edits needed to convert string a to string b - return distance[row][col] - -@enum.unique -class SearchTypes(enum.Enum): - 'Search type based on video object received' - Movie = 1 - Episode = 2 - - -class RegieLiveSearchAPI(): - 'Main class that interfaces with regielive sub provider' - video = None - title = None - session = None - search_type = SearchTypes.Movie - - def __init__(self, video): - 'Constructor that needs a [Movie, Episode] object' - self.video = video - self.initialize() - - def initialize(self): - 'Instance initialization goes here' - if isinstance(self.video, Episode): - self.search_type = SearchTypes.Episode - self.title = self.video.series - else: - self.title = self.video.title - - self.session = Session() - self.session.headers.update(DEFAULT_HEADERS) - logger.debug('Initialized new RegieLiveSearchAPI with search type %s of object %s', - self.search_type, str(self.video)) - - def get_req_cookies(self): - 'Get cookies used for request' - if self.session: - return self.session.cookies - - return None - - def search_video(self): - 'Main function that should be called to get sub data back' - if self.video is None: - return None - - results = self.search_lite_api() - - if not results: - sleep(2.0) #stagger request in order to no flood the server - results = self.search_page() - - if not results or results['data'] is None: - return None # not logging since we can't get here without logging the reason elsewhere - - logger.debug(results) - found_subs = self.parse_page(results) - logger.debug(found_subs) - - return found_subs - - def parse_page(self, results): - 'fetch and parse episode/movie page' - if len(results['data']) > 1: - logger.warning("More than one page result for subtitle %s with data %s", - self.title, - str(results['data'])) - - sub_list = None - if self.search_type is SearchTypes.Movie: - sub_list = self.parse_movie_pages(results['data']) - else: - sub_list = self.parse_episode_pages(results['data']) - - return sub_list - - def parse_movie_pages(self, sub_page_data): - 'Fetch and parse movie page data' - sub_list = [] - for result in sub_page_data: - extracted_subs = self.extract_movie_sub_block( - self.get_page(result['url'], None)) - sub_data = self.parse_sub_block(extracted_subs) - if sub_data: - sub_list.extend(sub_data) - else: - logger.debug( - 'Empty results from url %s with resulted block %s', result['url'], str(sub_data)) - - return sub_list - - def parse_episode_pages(self, sub_page_data): - 'Fetch and parse episode pages' - season = SEASON_URL % self.video.season - url = '' - sub_list = [] - for result in sub_page_data: - url = urlparse.urljoin(result['url'], season) - extracted_subs = self.extract_episode_sub_block( - self.get_page(url, None)) - sub_data = self.parse_sub_block(extracted_subs) - if sub_data: - sub_list.extend(sub_data) - else: - logger.debug( - 'Empty results from url %s with resulted block %s', url, str(sub_data)) - - return sub_list - - def search_page(self): - """ - Scrape search the page for the title - This does not take into consideration pagination - since the titles should be pretty unique and this api - is not a search engine. - I will make the pagination too if this, later, turns out to be a problem - Return a similar object to the lite api in order to be consistent - """ - cache_key = sha1(CACHE_PREFIX + self.title.encode("utf-8"), usedforsecurity=False).digest() - cached_response = region.get(cache_key) - if cached_response: - logger.info("Found cached reply for search request %s", self.title) - return cached_response - - response = self.get_api_page(PAGE_SEARCH_PATH, {'s': self.title}) - data = {'error': True, 'data': []} - - if response: - m_iter = SEARCH_PAGE_MATCH.finditer(response) - if m_iter: - for m in m_iter: - data['data'].append({ - 'id': RegieLiveSearchAPI.get_id_from_url(m.group(1)), - 'text': m.group(2), - 'url': m.group(1), - 'an': m.group(3) - }) - - # could be more efficient doing this in the previous iteration - data['data'] = self.parse_json_results(data['data']) - - if data['data'] and len(data['data']) > 0: - data['error'] = False - region.set(cache_key, data) - - return data - - def search_lite_api(self): - 'Access the lite json api for info' - response = self.get_api_page(LITE_JSON_PATH, {'s': self.title}, True) - - if response is None: - logger.warning( - "Regielive lite API failed to provide a proper reply") - return None - - if response['error'] or not response['data']: - logger.warning("Regielive API responded with no results!") - logger.info(response) - return None - - response['data'] = self.parse_json_results(response['data']) - - return response - - def parse_json_results(self, data_arr): - 'Parses the results of our lite api request' - if not data_arr: - return None - - result = list(filter(self.json_result_filter, data_arr)) - - if not result: - return None - - return result - - def json_result_filter(self, element): - 'Filter function for json results' - if not element: - return False - - match_ratio = title_match(element['text'], self.title, True) - element_year = RegieLiveSearchAPI.get_safe_int(element['an']) - - #if none have valid years we'll let it match - #also RL sometimes has the year off by 1 - if abs(element_year - RegieLiveSearchAPI.get_safe_int(self.video.year)) <= 1 and match_ratio > 0.9: - return True - - logger.info("No match for title %s year %i and returned title %s year %i match ration %f", - self.title, - self.video.year, - element['text'], - element_year, - match_ratio) - return False - - def get_api_page(self, url, url_params, return_json=False): - 'request a page from RL API' - return self.get_page(urlparse.urljoin(BASE_URL, url), url_params, return_json) - - def get_page(self, url, url_params, return_json=False): - 'Request a page' - try: - req = self.session.get(url, params=url_params, - timeout=REQUEST_TIMEOUT, - allow_redirects=True) - req.raise_for_status() - - if return_json: - return req.json() - - return req.text - except req_exceptions.HTTPError as err: - logger.exception( - "Failed to request url %s\n Error %s", url, str(err)) - - return None - - def extract_movie_sub_block(self, page_html): - 'extract subtitles block from movie page' - m = SUB_PAGE_MOVIE_MATCH.search(page_html) - if m: - return m.group(1) - - logger.info("Could not find subtitle block for Movie %s", self.title) - return '' - - def extract_episode_sub_block(self, page_html): - 'extract subtitle from series page' - episode_zone_regex = SUB_PAGE_EPISODE_PATTERN % self.video.episode - m = None - try: - m = re.search(episode_zone_regex, page_html) - except Exception as err: - logger.debug(str(page_html)) - logger.exception(err) - - if m: - return m.group(1) - - logger.info("Could not find episode %i for season %i of series %s", - self.video.episode, - self.video.season, - self.title) - return '' - - def parse_sub_block(self, subs_block): - 'Parse sub block into subtitle objects' - if subs_block is None: - return None - - m_iter = SUB_FILE_INFO_MATCH.finditer(subs_block) - sub_list = [] - if m_iter: - for match in m_iter: - sub_list.append( - RegieLiveAPIData(match.group(1), match.group(5), - RegieLiveAPIRating(match.group(2), match.group(4)))) - else: - logger.debug('No subtitles matched for sub block %s of title %s', str( - subs_block), self.title) - - return sub_list - - @classmethod - def get_id_from_url(cls, url): - 'get the movie rl id from page url' - m = re.search(r'(?ms)(\d+)/', url) - if m: - return m.group(1) - - return 0 - - @classmethod - def get_safe_int(cls, value, default_value = 0): - 'returns an int from the supplied value or a default' - if value and ( isinstance(value, int) or (isinstance(value, str) and value.isdigit()) ): - return int(value) - - return default_value -