@ -21,8 +21,6 @@ from subzero.language import Language
# Module-level logger, named after this module.
logger = logging . getLogger ( __name__ )
# Bytes pattern that pulls out whole <td> cells whose class is "version" or "vr";
# DOTALL lets the non-greedy cell content span multiple lines.
show_cells_re = re . compile ( b ' <td class= " (?:version|vr) " >.*?</td> ' , re . DOTALL )
#: Series header parsing regex
#: Anchored to the whole string: the named group ``series`` captures the title and the
#: optional named group ``year`` captures a trailing four-digit number in parentheses.
series_year_re = re . compile ( r ' ^(?P<series>[ \ w \' .:(),*&!?-]+?)(?: \ ((?P<year> \ d {4} ) \ ))?$ ' )
@ -232,34 +230,29 @@ class Addic7edProvider(_Addic7edProvider):
logger . info ( ' Getting show ids ' )
region . set ( self . last_show_ids_fetch_key , datetime . datetime . now ( ) )
r = self . session . get ( self . server_url + ' shows.php ' , timeout = 10 )
r = self . session . get ( self . server_url , timeout = 10 )
r . raise_for_status ( )
# LXML parser seems to fail when parsing Addic7ed.com HTML markup.
# Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
# Assuming the site's markup is bad, and stripping it down to only contain what's needed.
show_cells = re . findall ( show_cells_re , r . content )
if show_cells :
soup = ParserBeautifulSoup ( b ' ' . join ( show_cells ) . decode ( ' utf-8 ' , ' ignore ' ) , [ ' lxml ' , ' html.parser ' ] )
else :
# If RegEx fails, fall back to original r.text and use 'html.parser'
soup = ParserBeautifulSoup ( r . text , [ ' html.parser ' ] )
soup = ParserBeautifulSoup ( r . content . decode ( ' utf-8 ' , ' ignore ' ) , [ ' lxml ' , ' html.parser ' ] )
# populate the show ids
show_ids = { }
shows = soup . select( ' td > h3 > a[href^= " /show/ " ] ' )
shows = soup . find ( id = ' qsShow ' )
for show in shows :
show_clean = sanitize ( show . text , default_characters = self . sanitize_characters )
try :
show_id = int ( show [ ' href ' ] [ 6 : ] )
except ValueError :
continue
if hasattr ( show , ' attrs ' ) :
try :
show_id = int ( show . attrs [ ' value ' ] )
except ValueError :
continue
if show_id != 0 :
show_clean = sanitize ( show . text , default_characters = self . sanitize_characters )
show_ids [ show_clean ] = show_id
match = series_year_re . match ( show_clean )
if match and match . group ( 2 ) and match . group ( 1 ) not in show_ids :
# year found, also add it without year
show_ids [ match . group ( 1 ) ] = show_id
show_ids [ show_clean ] = show_id
match = series_year_re . match ( show_clean )
if match and match . group ( 2 ) and match . group ( 1 ) not in show_ids :
# year found, also add it without year
show_ids [ match . group ( 1 ) ] = show_id
soup . decompose ( )
soup = None