You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readarr/NzbDrone.Core/Parser.cs

432 lines
22 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using NLog;
using NzbDrone.Core.Model;
using NzbDrone.Core.Repository.Quality;
namespace NzbDrone.Core
{
public static class Parser
{
private static readonly Logger Logger = LogManager.GetCurrentClassLogger();
private static readonly Regex[] ReportTitleRegex = new[]
{
//Episodes with airdate
new Regex(@"^(?<title>.+?)?\W*(?<airyear>\d{4})\W+(?<airmonth>\d{2})\W+(?<airday>\d{2})\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Multi-Part episodes without a title (S01E05.S01E06)
new Regex(@"^(?:\W*S?(?<season>\d{1,2}(?!\d+))(?:(?:\-|[ex]|\s){1,2}(?<episode>\d{1,2}(?!\d+)))+){2,}\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Multi-episode (S01E05E06, S01E05-06, etc)
new Regex(@"^(?<title>.+?)(?:\W+S?(?<season>\d{1,2}(?!\d+))(?:(?:\-|[ex]|\s){1,2}(?<episode>\d{2}(?!\d+)))+){2,}\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Single episodes (S01E05, 1x05, etc)
new Regex(@"^(?<title>.+?)(?:\W+S?(?<season>\d{1,2}(?!\d+))(?:(?:\-|[ex]|\s){1,2}(?<episode>\d{2}(?!\d+)))+)\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Mini-Series, treated as season 1, episodes are labeled as Part01, Part 01, Part.1
new Regex(@"^(?<title>.+?)(?:\W+(?:(?:Part\W?|(?<!\d+\W+)e)(?<episode>\d{1,2}(?!\d+)))+)\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes without a title, Single (S01E05, 1x05) AND Multi (S01E04E05, 1x04x05)
new Regex(@"^(?:S?(?<season>\d{1,2}(?!\d+))(?:(?:\-|[ex]|\s)(?<episode>\d{1,2}(?!\d+)))+\W*)+\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Episodes over 99 (3-digits or more)
new Regex(@"^(?<title>.*?)(?:\W?S?(?<season>\d{1,2}(?!\d+))(?:(?:\-|[ex]|\s){1,2}(?<episode>\d+))+)+\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports 1103/1113 naming
new Regex(@"^(?<title>.+?)?(?:\W?(?<season>(?<!\d+|\(|\[)\d{2})(?<episode>\d{2}(?!p|i|\d+|\)|\])))+\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports 103/113 naming
new Regex(@"^(?<title>.+?)?(?:\W?(?<season>(?<!\d+)\d{1})(?<episode>\d{2}(?!p|i|\d+)))+\W?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
//Supports Season only releases
new Regex(@"^(?<title>.+?)\W(?:S|Season)\W?(?<season>\d{1,2}(?!\d+))\W?(?<extras>EXTRAS|SUBPACK)?(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled)
};
private static readonly Regex NormalizeRegex = new Regex(@"((^|\W)(a|an|the|and|or|of)($|\W))|\W|(?:(?<=[^0-9]+)|\b)(?!(?:19\d{2}|20\d{2}))\d+(?=[^0-9ip]+|\b)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex SimpleTitleRegex = new Regex(@"480[i|p]|720[i|p]|1080[i|p]|[x|h|x\s|h\s]264|DD\W?5\W1|\<|\>|\?|\*|\:|\||""",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex ReportSizeRegex = new Regex(@"(?<value>\d+\.\d{1,2}|\d+\,\d+\.\d{1,2})\W?(?<unit>GB|MB|GiB|MiB)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
internal static EpisodeParseResult ParsePath(string path)
{
var fileInfo = new FileInfo(path);
var result = ParseTitle(fileInfo.Name);
if (result == null)
{
Logger.Trace("Attempting to parse episode info using full path. {0}", fileInfo.FullName);
result = ParseTitle(fileInfo.FullName);
}
return result;
}
internal static EpisodeParseResult ParseTitle(string title)
{
try
{
Logger.Trace("Parsing string '{0}'", title);
var simpleTitle = SimpleTitleRegex.Replace(title, String.Empty);
foreach (var regex in ReportTitleRegex)
{
var match = regex.Matches(simpleTitle);
if (match.Count != 0)
{
var result = ParseMatchCollection(match);
if (result != null)
{
//Check if episode is in the future (most likley a parse error)
if (result.AirDate > DateTime.Now.AddDays(1).Date)
break;
result.Language = ParseLanguage(title);
result.Quality = ParseQuality(title);
return result;
}
}
}
Logger.Warn("Unable to parse episode info. {0}", title);
}
catch (Exception e)
{
Logger.Error("An error has occurred while trying to parse '{0}'", title);
}
return null;
}
private static EpisodeParseResult ParseMatchCollection(MatchCollection matchCollection)
{
var seriesName = matchCollection[0].Groups["title"].Value;
int airyear;
Int32.TryParse(matchCollection[0].Groups["airyear"].Value, out airyear);
EpisodeParseResult parsedEpisode;
if (airyear < 1900)
{
var seasons = new List<int>();
foreach (Capture seasonCapture in matchCollection[0].Groups["season"].Captures)
{
int parsedSeason;
if (Int32.TryParse(seasonCapture.Value, out parsedSeason))
seasons.Add(parsedSeason);
}
//If no season was found it should be treated as a mini series and season 1
if (seasons.Count == 0)
seasons.Add(1);
//If more than 1 season was parsed go to the next REGEX (A multi-season release is unlikely)
if (seasons.Distinct().Count() > 1)
return null;
parsedEpisode = new EpisodeParseResult
{
SeasonNumber = seasons.First(),
EpisodeNumbers = new List<int>()
};
foreach (Match matchGroup in matchCollection)
{
var episodeCaptures = matchGroup.Groups["episode"].Captures.Cast<Capture>().ToList();
//Allows use to return a list of 0 episodes (We can handle that as a full season release)
if (episodeCaptures.Any())
{
var first = Convert.ToInt32(episodeCaptures.First().Value);
var last = Convert.ToInt32(episodeCaptures.Last().Value);
parsedEpisode.EpisodeNumbers = Enumerable.Range(first, last - first + 1).ToList();
}
else
{
//Check to see if this is an "Extras" or "SUBPACK" release, if it is, return NULL
//Todo: Set a "Extras" flag in EpisodeParseResult if we want to download them ever
if (!String.IsNullOrWhiteSpace(matchCollection[0].Groups["extras"].Value))
return null;
parsedEpisode.FullSeason = true;
}
}
}
else
{
//Try to Parse as a daily show
var airmonth = Convert.ToInt32(matchCollection[0].Groups["airmonth"].Value);
var airday = Convert.ToInt32(matchCollection[0].Groups["airday"].Value);
parsedEpisode = new EpisodeParseResult
{
AirDate = new DateTime(airyear, airmonth, airday).Date,
};
}
parsedEpisode.SeriesTitle = seriesName;
Logger.Trace("Episode Parsed. {0}", parsedEpisode);
return parsedEpisode;
}
public static string ParseSeriesName(string title)
{
Logger.Trace("Parsing string '{0}'", title);
foreach (var regex in ReportTitleRegex)
{
var match = regex.Matches(title);
if (match.Count != 0)
{
var seriesName = NormalizeTitle(match[0].Groups["title"].Value);
Logger.Trace("Series Parsed. {0}", seriesName);
return seriesName;
}
}
return NormalizeTitle(title);
}
internal static Quality ParseQuality(string name)
{
Logger.Trace("Trying to parse quality for {0}", name);
name = name.Trim();
var normalizedName = NormalizeTitle(name);
var result = new Quality { QualityType = QualityTypes.Unknown };
result.Proper = normalizedName.Contains("proper");
if (normalizedName.Contains("dvd") || normalizedName.Contains("bdrip") || normalizedName.Contains("brrip"))
{
result.QualityType = QualityTypes.DVD;
return result;
}
if (normalizedName.Contains("xvid") || normalizedName.Contains("divx"))
{
if (normalizedName.Contains("bluray"))
{
result.QualityType = QualityTypes.DVD;
return result;
}
result.QualityType = QualityTypes.SDTV;
return result;
}
if (normalizedName.Contains("bluray"))
{
if (normalizedName.Contains("720p"))
{
result.QualityType = QualityTypes.Bluray720p;
return result;
}
if (normalizedName.Contains("1080p"))
{
result.QualityType = QualityTypes.Bluray1080p;
return result;
}
result.QualityType = QualityTypes.Bluray720p;
return result;
}
if (normalizedName.Contains("webdl"))
{
result.QualityType = QualityTypes.WEBDL;
return result;
}
if (normalizedName.Contains("x264") || normalizedName.Contains("h264") || normalizedName.Contains("720p"))
{
result.QualityType = QualityTypes.HDTV;
return result;
}
//Based on extension
if (result.QualityType == QualityTypes.Unknown)
{
try
{
switch (Path.GetExtension(name).ToLower())
{
case ".avi":
case ".xvid":
case ".divx":
case ".wmv":
case ".mp4":
case ".mpg":
case ".mpeg":
case ".mov":
case ".rm":
case ".rmvb":
case ".flv":
case ".dvr-ms":
case ".ogm":
{
result.QualityType = QualityTypes.SDTV;
break;
}
case ".mkv":
case ".ts":
{
result.QualityType = QualityTypes.HDTV;
break;
}
}
}
catch (ArgumentException)
{
//Swallow exception for cases where string contains illegal
//path characters.
}
}
if (name.Contains("[HDTV]"))
{
result.QualityType = QualityTypes.HDTV;
return result;
}
if ((normalizedName.Contains("sdtv") ||
(result.QualityType == QualityTypes.Unknown && normalizedName.Contains("hdtv"))) &&
!normalizedName.Contains("mpeg"))
{
result.QualityType = QualityTypes.SDTV;
return result;
}
return result;
}
internal static LanguageType ParseLanguage(string title)
{
var lowerTitle = title.ToLower();
if (lowerTitle.Contains("english"))
return LanguageType.English;
if (lowerTitle.Contains("french"))
return LanguageType.French;
if (lowerTitle.Contains("spanish"))
return LanguageType.Spanish;
if (lowerTitle.Contains("german"))
{
//Make sure it doesn't contain Germany (Since we're not using REGEX for all this)
if (!lowerTitle.Contains("germany"))
return LanguageType.German;
}
if (lowerTitle.Contains("italian"))
return LanguageType.Italian;
if (lowerTitle.Contains("danish"))
return LanguageType.Danish;
if (lowerTitle.Contains("dutch"))
return LanguageType.Dutch;
if (lowerTitle.Contains("japanese"))
return LanguageType.Japanese;
if (lowerTitle.Contains("cantonese"))
return LanguageType.Cantonese;
if (lowerTitle.Contains("mandarin"))
return LanguageType.Mandarin;
if (lowerTitle.Contains("korean"))
return LanguageType.Korean;
if (lowerTitle.Contains("russian"))
return LanguageType.Russian;
if (lowerTitle.Contains("polish"))
return LanguageType.Polish;
if (lowerTitle.Contains("vietnamese"))
return LanguageType.Vietnamese;
if (lowerTitle.Contains("swedish"))
return LanguageType.Swedish;
if (lowerTitle.Contains("norwegian"))
return LanguageType.Norwegian;
if (lowerTitle.Contains("finnish"))
return LanguageType.Finnish;
if (lowerTitle.Contains("turkish"))
return LanguageType.Turkish;
if (lowerTitle.Contains("portuguese"))
return LanguageType.Portuguese;
return LanguageType.English;
}
/// <summary>
/// Normalizes the title. removing all non-word characters as well as common tokens
/// such as 'the' and 'and'
/// </summary>
/// <param name = "title">title</param>
/// <returns></returns>
public static string NormalizeTitle(string title)
{
long number = 0;
//If Title only contains numbers return it as is.
if (Int64.TryParse(title, out number))
return title;
return NormalizeRegex.Replace(title, String.Empty).ToLower();
}
public static long GetReportSize(string sizeString)
{
var match = ReportSizeRegex.Matches(sizeString);
if (match.Count != 0)
{
var value = Convert.ToDecimal(Regex.Replace(match[0].Groups["value"].Value, "\\,", ""));
var unit = match[0].Groups["unit"].Value;
if (unit.Equals("MB", StringComparison.InvariantCultureIgnoreCase) || unit.Equals("MiB", StringComparison.InvariantCultureIgnoreCase))
return Convert.ToInt64(value * 1048576L);
if (unit.Equals("GB", StringComparison.InvariantCultureIgnoreCase) || unit.Equals("GiB", StringComparison.InvariantCultureIgnoreCase))
return Convert.ToInt64(value * 1073741824L);
}
return 0;
}
}
}