using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using NLog;
using NzbDrone.Common.Instrumentation;
using NzbDrone.Core.Parser.Model;
using NzbDrone.Core.Tv;
namespace NzbDrone.Core.Parser
{
public static class Parser
{
private static readonly Logger Logger = NzbDroneLogger.GetLogger();
private static readonly Regex[] ReportTitleRegex = new[]
{
//Episodes with airdate
new Regex(@"^(?
.+?)?\W*(?\d{4})\W+(?[0-1][0-9])\W+(?[0-3][0-9])(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Multi-Part episodes without a title (S01E05.S01E06)
new Regex(@"^(?:\W*S?(?(?\d{1,3}(?!\d+)))+){2,}(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Multi-episode Repeated (S01E05 - S01E06, 1x05 - 1x06, etc)
new Regex(@"^(?.+?)(?:(\W|_)+S?(?(?\d{1,3}(?!\d+)))+){2,}(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes without a title, Single (S01E05, 1x05) AND Multi (S01E04E05, 1x04x05, etc)
new Regex(@"^(?:S?(?(?\d{2,3}(?!\d+)))+)(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes with a title, Single episodes (S01E05, 1x05, etc) & Multi-episode (S01E05E06, S01E05-06, S01E05 E06, etc)
new Regex(@"^(?.+?)(?:(\W|_)+S?(?(?\d{2,3}(?!\d+)))+)\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes with a title, Single episodes (S01E05, 1x05, etc) & Multi-episode (S01E05E06, S01E05-06, S01E05 E06, etc)
new Regex(@"^(?.+?)(?:\W+S?(?(?\d{2,3}(?!\d+)))+)(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes with single digit episode number (S01E1, S01E5E6, etc)
new Regex(@"^(?.*?)(?:\W?S?(?(?\d{1}))+)+(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports 103/113 naming
new Regex(@"^(?.+?)?(?:\W?(?(?\d{2}(?!p|i|\d+)))+(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Mini-Series, treated as season 1, episodes are labelled as Part01, Part 01, Part.1
new Regex(@"^(?.+?)(?:\W+(?:(?:Part\W?|(?\d{1,2}(?!\d+)))+)(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports 1103/1113 naming
new Regex(@"^(?.+?)?(?:\W?(?(?\d{2}(?!p|i|\d+|\)|\]|\W\d+)))+(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports Season 01 Episode 03
new Regex(@"(?:.*(?:\""|^))(?.*?)(?:\W?Season\W?)(?(?(?.+?)\W(?:S|Season)\W?(?\d{1,2}(?!\d+))(\W+|_|$)(?EXTRAS|SUBPACK)?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled)
};
private static readonly Regex NormalizeRegex = new Regex(@"((^|\W|_)(a|an|the|and|or|of)($|\W|_))|\W|_|(?:(?<=[^0-9]+)|\b)(?!(?:19\d{2}|20\d{2}))\d+(?=[^0-9ip]+|\b)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex SimpleTitleRegex = new Regex(@"480[i|p]|720[i|p]|1080[i|p]|[x|h|x\s|h\s]264|DD\W?5\W1|\<|\>|\?|\*|\:|\|",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex MultiPartCleanupRegex = new Regex(@"\(\d+\)$", RegexOptions.Compiled);
private static readonly Regex LanguageRegex = new Regex(@"(?:\W|_)(?ita|italian)|(?german\b)|(?flemish)|(?greek)|(?(?:\W|_)FR)(?:\W|_)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex YearInTitleRegex = new Regex(@"^(?.+?)(?:\W|_)?(?\d{4})",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
public static ParsedEpisodeInfo ParsePath(string path)
{
var fileInfo = new FileInfo(path);
var result = ParseTitle(fileInfo.Name);
if (result == null)
{
Logger.Trace("Attempting to parse episode info using full path. {0}", fileInfo.FullName);
result = ParseTitle(fileInfo.FullName);
}
if (result == null)
{
Logger.Warn("Unable to parse episode info from path {0}", path);
}
return result;
}
public static ParsedEpisodeInfo ParseTitle(string title)
{
try
{
if (!ValidateBeforeParsing(title)) return null;
Logger.Trace("Parsing string '{0}'", title);
var simpleTitle = SimpleTitleRegex.Replace(title, String.Empty);
foreach (var regex in ReportTitleRegex)
{
var match = regex.Matches(simpleTitle);
if (match.Count != 0)
{
try
{
var result = ParseMatchCollection(match);
if (result != null)
{
result.Language = ParseLanguage(title);
result.Quality = QualityParser.ParseQuality(title);
return result;
}
}
catch (InvalidDateException ex)
{
Logger.TraceException(ex.Message, ex);
break;
}
}
}
}
catch (Exception e)
{
if (!title.ToLower().Contains("password") && !title.ToLower().Contains("yenc"))
Logger.ErrorException("An error has occurred while trying to parse " + title, e);
}
Logger.Trace("Unable to parse {0}", title);
return null;
}
public static string ParseSeriesName(string title)
{
Logger.Trace("Parsing string '{0}'", title);
var parseResult = ParseTitle(title);
if (parseResult == null)
{
return CleanSeriesTitle(title);
}
return parseResult.SeriesTitle;
}
public static string CleanSeriesTitle(this string title)
{
long number = 0;
//If Title only contains numbers return it as is.
if (Int64.TryParse(title, out number))
return title;
return NormalizeRegex.Replace(title, String.Empty).ToLower();
}
public static string CleanupEpisodeTitle(string title)
{
//this will remove (1),(2) from the end of multi part episodes.
return MultiPartCleanupRegex.Replace(title, string.Empty).Trim();
}
private static SeriesTitleInfo GetSeriesTitleInfo(string title)
{
var seriesTitleInfo = new SeriesTitleInfo();
seriesTitleInfo.Title = title;
var match = YearInTitleRegex.Match(title);
if (!match.Success)
{
seriesTitleInfo.TitleWithoutYear = title;
}
else
{
seriesTitleInfo.TitleWithoutYear = match.Groups["title"].Value;
seriesTitleInfo.Year = Convert.ToInt32(match.Groups["year"].Value);
}
return seriesTitleInfo;
}
private static ParsedEpisodeInfo ParseMatchCollection(MatchCollection matchCollection)
{
var seriesName = matchCollection[0].Groups["title"].Value.Replace('.', ' ');
int airYear;
Int32.TryParse(matchCollection[0].Groups["airyear"].Value, out airYear);
ParsedEpisodeInfo result;
if (airYear < 1900)
{
var seasons = new List();
foreach (Capture seasonCapture in matchCollection[0].Groups["season"].Captures)
{
int parsedSeason;
if (Int32.TryParse(seasonCapture.Value, out parsedSeason))
seasons.Add(parsedSeason);
}
//If no season was found it should be treated as a mini series and season 1
if (seasons.Count == 0)
seasons.Add(1);
//If more than 1 season was parsed go to the next REGEX (A multi-season release is unlikely)
if (seasons.Distinct().Count() > 1)
return null;
result = new ParsedEpisodeInfo
{
SeasonNumber = seasons.First(),
EpisodeNumbers = new int[0],
};
foreach (Match matchGroup in matchCollection)
{
var episodeCaptures = matchGroup.Groups["episode"].Captures.Cast().ToList();
//Allows use to return a list of 0 episodes (We can handle that as a full season release)
if (episodeCaptures.Any())
{
var first = Convert.ToInt32(episodeCaptures.First().Value);
var last = Convert.ToInt32(episodeCaptures.Last().Value);
if (first > last)
{
return null;
}
var count = last - first + 1;
result.EpisodeNumbers = Enumerable.Range(first, count).ToArray();
}
else
{
//Check to see if this is an "Extras" or "SUBPACK" release, if it is, return NULL
//Todo: Set a "Extras" flag in EpisodeParseResult if we want to download them ever
if (!String.IsNullOrWhiteSpace(matchCollection[0].Groups["extras"].Value))
return null;
result.FullSeason = true;
}
}
}
else
{
//Try to Parse as a daily show
var airmonth = Convert.ToInt32(matchCollection[0].Groups["airmonth"].Value);
var airday = Convert.ToInt32(matchCollection[0].Groups["airday"].Value);
//Swap day and month if month is bigger than 12 (scene fail)
if (airmonth > 12)
{
var tempDay = airday;
airday = airmonth;
airmonth = tempDay;
}
var airDate = new DateTime(airYear, airmonth, airday);
//Check if episode is in the future (most likely a parse error)
if (airDate > DateTime.Now.AddDays(1).Date || airDate < new DateTime(1970, 1, 1))
{
throw new InvalidDateException("Invalid date found: {0}", airDate);
}
result = new ParsedEpisodeInfo
{
AirDate = airDate.ToString(Episode.AIR_DATE_FORMAT),
};
}
result.SeriesTitle = CleanSeriesTitle(seriesName);
result.SeriesTitleInfo = GetSeriesTitleInfo(result.SeriesTitle);
Logger.Trace("Episode Parsed. {0}", result);
return result;
}
private static Language ParseLanguage(string title)
{
var lowerTitle = title.ToLower();
if (lowerTitle.Contains("english"))
return Language.English;
if (lowerTitle.Contains("french"))
return Language.French;
if (lowerTitle.Contains("spanish"))
return Language.Spanish;
if (lowerTitle.Contains("danish"))
return Language.Danish;
if (lowerTitle.Contains("dutch"))
return Language.Dutch;
if (lowerTitle.Contains("japanese"))
return Language.Japanese;
if (lowerTitle.Contains("cantonese"))
return Language.Cantonese;
if (lowerTitle.Contains("mandarin"))
return Language.Mandarin;
if (lowerTitle.Contains("korean"))
return Language.Korean;
if (lowerTitle.Contains("russian"))
return Language.Russian;
if (lowerTitle.Contains("polish"))
return Language.Polish;
if (lowerTitle.Contains("vietnamese"))
return Language.Vietnamese;
if (lowerTitle.Contains("swedish"))
return Language.Swedish;
if (lowerTitle.Contains("norwegian"))
return Language.Norwegian;
if (lowerTitle.Contains("finnish"))
return Language.Finnish;
if (lowerTitle.Contains("turkish"))
return Language.Turkish;
if (lowerTitle.Contains("portuguese"))
return Language.Portuguese;
if (lowerTitle.Contains("nlsub"))
return Language.Norwegian;
var match = LanguageRegex.Match(title);
if (match.Groups["italian"].Captures.Cast().Any())
return Language.Italian;
if (match.Groups["german"].Captures.Cast().Any())
return Language.German;
if (match.Groups["flemish"].Captures.Cast().Any())
return Language.Flemish;
if (match.Groups["greek"].Captures.Cast().Any())
return Language.Greek;
if (match.Groups["french"].Success)
return Language.French;
return Language.English;
}
private static bool ValidateBeforeParsing(string title)
{
if (title.ToLower().Contains("password") && title.ToLower().Contains("yenc"))
{
Logger.Trace("");
return false;
}
if (!title.Any(Char.IsLetterOrDigit) || (!title.Any(Char.IsPunctuation) && !title.Any(Char.IsWhiteSpace)))
{
return false;
}
return true;
}
}
}