# recyclarr/src/app/guide/sonarr/profile.py

import re
from collections import defaultdict
from enum import Enum
import requests

from app.profile_data import ProfileData

# The three kinds of terms a release profile can hold.
TermCategory = Enum('TermCategory', 'Preferred Required Ignored')

# A markdown header line. Group 1 is the run of '#' characters (its length is used as the header
# depth); group 2 is the header text, limited to word characters and whitespace.
header_regex = re.compile(r'^(#+)\s([\w\s\d]+)\s*$')
# The word "score" followed (lazily) by a bracketed integer, e.g. "score [-100]".
# Group 1 is the integer, optionally negative. Case-insensitive.
score_regex = re.compile(r'score.*?\[(-?[\d]+)\]', re.IGNORECASE)
# Header text containing "release profile" (case-insensitive) starts a new profile.
header_release_profile_regex = re.compile(r'release profile', re.IGNORECASE)
# (category, pattern) pairs used by parse_category(). They are tried in this order and the first
# pattern found in the line wins. All patterns are case-insensitive.
category_regex = (
    (TermCategory.Required, re.compile(r'must contain', re.IGNORECASE)),
    (TermCategory.Ignored, re.compile(r'must not contain', re.IGNORECASE)),
    (TermCategory.Preferred, re.compile(r'preferred', re.IGNORECASE)),
)

class ParserState:
    """Mutable bookkeeping carried from one markdown line to the next while parsing."""

    def __init__(self):
        # Header text of the profile currently being parsed; None until one is found.
        self.profile_name = None
        # Most recently parsed score; None until a score has been seen.
        self.score = None
        # Category that terms in the next fenced block are filed under.
        self.current_category = TermCategory.Preferred
        # 0 while outside a fenced code block, 1 while inside one.
        self.bracket_depth = 0
        # Header depth associated with the current profile; -1 means "none".
        self.current_header_depth = -1

    def reset(self):
        """Put every field back to the value assigned in ``__init__``."""
        self.__init__()

    def is_valid(self):
        """Tell whether enough context is known to capture a term.

        A profile name and a category are always required. The Preferred
        category additionally requires a score.
        """
        if self.profile_name is None or self.current_category is None:
            return False

        if self.current_category == TermCategory.Preferred:
            return self.score is not None

        return True

# --------------------------------------------------------------------------------------------------
def get_markdown(page, timeout=30):
    """Download the raw markdown of one Sonarr V3 guide page.

    Args:
        page: Page name without the ``.md`` extension; it is interpolated into the URL as-is.
        timeout: Seconds to wait on the server before giving up. Without a timeout, ``requests``
            may block indefinitely on an unresponsive connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.RequestException: Any other transport failure, including timeouts.
    """
    url = f'https://raw.githubusercontent.com/TRaSH-/Guides/master/docs/Sonarr/V3/{page}.md'
    response = requests.get(url, timeout=timeout)

    # Fail loudly on error responses; otherwise the body of an error page (e.g. a 404) would be
    # returned and parsed as though it were guide markdown.
    response.raise_for_status()
    return response.content.decode('utf8')

# --------------------------------------------------------------------------------------------------
def parse_category(line):
    """Return the category of the first ``category_regex`` entry found in ``line``, else None."""
    matching_categories = (
        category
        for category, pattern in category_regex
        if pattern.search(line)
    )
    return next(matching_categories, None)

# --------------------------------------------------------------------------------------------------
def parse_markdown_outside_fence(args, logger, line, state, results):
    """Process one markdown line that is not inside a fenced code block.

    Updates ``state`` (profile name, header depth, category, score) and the
    "include preferred when renaming" flag of the current profile in ``results``.

    Args:
        args: Unused here; kept so both fence handlers share one signature.
        logger: Object with a ``debug`` method used for trace output.
        line: The markdown line to inspect.
        state: ``ParserState`` that is read and mutated in place.
        results: Mapping from profile name to profile data. It is indexed with the current
            profile name, so a mapping with a default factory gets an entry created on first use.
    """
    # Header processing
    if match := header_regex.search(line):
        header_depth = len(match.group(1))
        header_text = match.group(2)
        logger.debug(f'> Parsing Header [Text: {header_text}] [Depth: {header_depth}]')

        # Profile name (always reset previous state here)
        if header_release_profile_regex.search(header_text):
            state.reset()
            state.profile_name = header_text
            # Remember how deep the profile header is. Without this the depth stays at its
            # initial -1 and the sibling/parent header check below can never fire.
            state.current_header_depth = header_depth
            logger.debug(f'  - New Profile [Text: {header_text}]')
            return

        # A header at the same or a shallower depth that is not a profile ends the current profile.
        elif header_depth <= state.current_header_depth:
            logger.debug('  - !! Non-nested, non-profile header found; resetting all state')
            state.reset()
            return

    # Until we find a header that defines a profile, we don't care about anything under it.
    if not state.profile_name:
        return

    # Check if we are enabling the "Include Preferred when Renaming" checkbox.
    # The flag is turned off when the same line also contains the substring "not".
    profile = results[state.profile_name]
    lower_line = line.lower()
    if 'include preferred' in lower_line:
        profile.include_preferred_when_renaming = 'not' not in lower_line
        logger.debug(f'  - "Include Preferred" found [Value: {profile.include_preferred_when_renaming}] [Line: {line}]')
        return

    # Either we have a nested header or normal line at this point
    # We need to check if we're defining a new category.
    if category := parse_category(line):
        state.current_category = category
        logger.debug(f'  - Category Set [Name: {category}] [Line: {line}]')
        # Deliberately no return here: the category and the score are sometimes in the same
        # sentence (line), so the score check below must still see this line.

    # Check this line for a score value. We do this even if our category may not be set to 'Preferred' yet.
    if match := score_regex.search(line):
        state.score = int(match.group(1))
        logger.debug(f'  - Score [Value: {state.score}]')

# --------------------------------------------------------------------------------------------------
def parse_markdown_inside_fence(args, logger, line, state, results):
    """File one line from inside a fenced code block as a term of the current profile.

    Preferred terms are stored under the current score. When
    ``args.strict_negative_scores`` is set and that score is negative, the term
    is stored as ignored instead. Required and ignored terms are stored with
    trailing commas removed.
    """
    profile = results[state.profile_name]
    category = state.current_category

    if category == TermCategory.Preferred:
        logger.debug('    + Capture Term '
                     f'[Category: {state.current_category}] '
                     f'[Score: {state.score}] '
                     f'[Strict: {args.strict_negative_scores}] '
                     f'[Term: {line}]')

        demote_to_ignored = args.strict_negative_scores and state.score < 0
        if demote_to_ignored:
            destination = profile.ignored
        else:
            destination = profile.preferred[state.score]
        destination.append(line)
        return

    # A trailing comma acts as a delimiter when these regexes are pasted into Sonarr,
    # but it is not wanted when the terms are sent through the API.
    line = line.rstrip(',')

    if category == TermCategory.Ignored:
        destination = profile.ignored
    elif category == TermCategory.Required:
        destination = profile.required
    else:
        # Any other category: nothing to capture.
        return

    destination.append(line)
    logger.debug(f'    + Capture Term [Category: {state.current_category}] [Term: {line}]')

# --------------------------------------------------------------------------------------------------
def parse_markdown(args, logger, markdown_content):
    """Parse guide markdown into a mapping of profile name to ``ProfileData``.

    The text is walked line by line. Lines starting with a code fence toggle
    between "outside" and "inside" handling and are otherwise ignored.
    """
    results = defaultdict(ProfileData)
    state = ParserState()

    for line in markdown_content.splitlines():
        # A fence marker flips between the two modes and carries no content of its own.
        if line.startswith('```'):
            state.bracket_depth = 1 - state.bracket_depth
            continue

        # Outside a fenced block: headers, categories, scores.
        if state.bracket_depth == 0:
            parse_markdown_outside_fence(args, logger, line, state, results)
            continue

        if state.bracket_depth != 1:
            continue

        # Inside a fenced block: each line is a term, provided we know where to file it.
        if state.is_valid():
            parse_markdown_inside_fence(args, logger, line, state, results)
            continue

        logger.debug('  - !! Inside bracket with invalid state; skipping! '
                     f'[Profile Name: {state.profile_name}] '
                     f'[Category: {state.current_category}] '
                     f'[Score: {state.score}] '
                     f'[Line: {line}] '
        )

    logger.debug('\n')
    return results