Fixed: (RuTracker/Toloka) Clean title

pull/1419/head
Bogdan 2 years ago
parent f99a2e1164
commit 8b898733ab

@ -2,7 +2,6 @@ using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
@ -25,10 +24,13 @@ namespace NzbDrone.Core.Indexers.Definitions
public class RuTracker : TorrentIndexerBase<RuTrackerSettings>
{
public override string Name => "RuTracker";
public override string[] IndexerUrls => new[] { "https://rutracker.org/", "https://rutracker.net/" };
private string LoginUrl => Settings.BaseUrl + "forum/login.php";
public override string[] IndexerUrls => new[]
{
"https://rutracker.org/",
"https://rutracker.net/"
};
public override string Description => "RuTracker is a Semi-Private Russian torrent site with a thriving file-sharing community";
public override string Language => "ru-org";
public override string Language => "ru-RU";
public override Encoding Encoding => Encoding.GetEncoding("windows-1251");
public override DownloadProtocol Protocol => DownloadProtocol.Torrent;
public override IndexerPrivacy Privacy => IndexerPrivacy.SemiPrivate;
@ -51,21 +53,24 @@ namespace NzbDrone.Core.Indexers.Definitions
protected override async Task DoLogin()
{
var requestBuilder = new HttpRequestBuilder(LoginUrl)
var loginUrl = $"{Settings.BaseUrl}forum/login.php";
var requestBuilder = new HttpRequestBuilder(loginUrl)
{
LogResponseContent = true,
AllowAutoRedirect = true,
Method = HttpMethod.Post
AllowAutoRedirect = true
};
var cookies = Cookies;
Cookies = null;
var authLoginRequest = requestBuilder
var authLoginRequest = requestBuilder.Post()
.AddFormParameter("login_username", Settings.Username)
.AddFormParameter("login_password", Settings.Password)
.AddFormParameter("login", "Login")
.AddFormParameter("redirect", "index.php")
.SetHeader("Content-Type", "application/x-www-form-urlencoded")
.SetHeader("Referer", loginUrl)
.Build();
var response = await ExecuteAuth(authLoginRequest);
@ -1416,21 +1421,6 @@ namespace NzbDrone.Core.Indexers.Definitions
return caps;
}
/// <summary>
/// Answers UI requests for this indexer. Only the "getUrls" action is handled:
/// it returns every entry of <c>IndexerUrls</c> as a Value/Name option pair.
/// </summary>
/// <param name="action">Name of the requested action.</param>
/// <param name="query">Request arguments; not read by this implementation.</param>
/// <returns>An object exposing an <c>options</c> sequence for "getUrls", otherwise <c>null</c>.</returns>
public override object RequestAction(string action, IDictionary<string, string> query)
{
    // Anything other than the URL listing is not supported here.
    if (action != "getUrls")
    {
        return null;
    }

    var options = IndexerUrls.Select(url => new { Value = url, Name = url });

    // The member name "options" is inferred from the local variable.
    return new { options };
}
}
public class RuTrackerRequestGenerator : IIndexerRequestGenerator
@ -1446,16 +1436,14 @@ namespace NzbDrone.Core.Indexers.Definitions
private IEnumerable<IndexerRequest> GetPagedRequests(string term, int[] categories, int season = 0)
{
var searchUrl = $"{_settings.BaseUrl.TrimEnd('/')}/forum/tracker.php";
var queryCollection = new NameValueCollection();
var parameters = new NameValueCollection();
var searchString = term;
// if the search string is empty use the getnew view
if (searchString.IsNullOrWhiteSpace())
{
queryCollection.Add("nm", searchString);
parameters.Set("nm", searchString);
}
else
{
@ -1466,19 +1454,28 @@ namespace NzbDrone.Core.Indexers.Definitions
searchString += " Сезон: " + season;
}
queryCollection.Add("nm", searchString);
parameters.Set("nm", searchString);
}
if (categories != null && categories.Length > 0)
{
queryCollection.Add("f", string.Join(",", _capabilities.Categories.MapTorznabCapsToTrackers(categories)));
parameters.Set("f", string.Join(",", _capabilities.Categories.MapTorznabCapsToTrackers(categories)));
}
searchUrl = searchUrl + "?" + queryCollection.GetQueryString();
var searchUrl = $"{_settings.BaseUrl}forum/tracker.php";
var request = new IndexerRequest(searchUrl, HttpAccept.Html);
if (parameters.Count > 0)
{
searchUrl += $"?{parameters.GetQueryString()}";
}
request.HttpRequest.AllowAutoRedirect = false;
var request = new IndexerRequest(searchUrl, HttpAccept.Html)
{
HttpRequest =
{
AllowAutoRedirect = false
}
};
yield return request;
}
@ -1542,6 +1539,8 @@ namespace NzbDrone.Core.Indexers.Definitions
private readonly RuTrackerSettings _settings;
private readonly IndexerCapabilitiesCategories _categories;
private readonly RuTrackerTitleParser _titleParser = new ();
public RuTrackerParser(RuTrackerSettings settings, IndexerCapabilitiesCategories categories)
{
_settings = settings;
@ -1578,12 +1577,12 @@ namespace NzbDrone.Core.Indexers.Definitions
return null;
}
var link = _settings.BaseUrl + "forum/" + qDownloadLink.GetAttribute("href");
var qDetailsLink = row.QuerySelector("td.t-title-col > div.t-title > a.tLink");
var details = _settings.BaseUrl + "forum/" + qDetailsLink.GetAttribute("href");
var infoUrl = _settings.BaseUrl + "forum/" + qDetailsLink.GetAttribute("href");
var downloadUrl = _settings.BaseUrl + "forum/" + qDownloadLink.GetAttribute("href");
var category = GetCategoryOfRelease(row);
var title = qDetailsLink.TextContent.Trim();
var categories = GetCategoryOfRelease(row);
var size = GetSizeOfRelease(row);
@ -1596,139 +1595,224 @@ namespace NzbDrone.Core.Indexers.Definitions
var release = new TorrentInfo
{
MinimumRatio = 1,
MinimumSeedTime = 0,
Title = qDetailsLink.TextContent,
InfoUrl = details,
DownloadUrl = link,
Guid = details,
Guid = infoUrl,
InfoUrl = infoUrl,
DownloadUrl = downloadUrl,
Title = _titleParser.Parse(title, categories, _settings.RussianLetters, _settings.MoveFirstTagsToEndOfReleaseTitle, _settings.MoveAllTagsToEndOfReleaseTitle),
Description = title,
Categories = categories,
Size = size,
Seeders = seeders,
Peers = leechers + seeders,
Grabs = grabs,
PublishDate = publishDate,
Categories = category,
DownloadVolumeFactor = 1,
UploadVolumeFactor = 1
UploadVolumeFactor = 1,
MinimumRatio = 1,
MinimumSeedTime = 0
};
// TODO finish extracting release variables to simplify release initialization
if (IsAnyTvCategory(release.Categories))
return release;
}
private int GetSeedersOfRelease(in IElement row)
{
var seeders = 0;
var qSeeders = row.QuerySelector("td:nth-child(7)");
if (qSeeders != null && !qSeeders.TextContent.Contains("дн"))
{
// extract season and episodes
// should also handle multi-season releases listed as Сезон: 1-8 and Сезоны: 1-8
var regex = new Regex(@".+\/\s([^а-яА-я\/]+)\s\/.+Сезон.\s*[:]*\s+(\d*\-?\d*).+(?:Серии|Эпизод)+\s*[:]*\s+(\d+-?\d*).+(\[.*\])[\s]?(.*)");
var title = regex.Replace(release.Title, "$1 - S$2E$3 - rus $4 $5");
title = Regex.Replace(title, "-Rip", "Rip", RegexOptions.IgnoreCase);
title = Regex.Replace(title, "WEB-DLRip", "WEBDL", RegexOptions.IgnoreCase);
title = Regex.Replace(title, "WEB-DL", "WEBDL", RegexOptions.IgnoreCase);
title = Regex.Replace(title, "HDTVRip", "HDTV", RegexOptions.IgnoreCase);
title = Regex.Replace(title, "Кураж-Бамбей", "kurazh", RegexOptions.IgnoreCase);
release.Title = title;
var seedersString = qSeeders.QuerySelector("b").TextContent;
if (!string.IsNullOrWhiteSpace(seedersString))
{
seeders = ParseUtil.CoerceInt(seedersString);
}
}
else if (IsAnyMovieCategory(release.Categories))
return seeders;
}
/// <summary>
/// Resolves the categories of a result row from the "f" query argument of its forum link.
/// </summary>
/// <param name="row">Result row element to inspect.</param>
/// <returns>The categories mapped from the tracker forum id.</returns>
private ICollection<IndexerCategory> GetCategoryOfRelease(in IElement row)
{
    var forumAnchor = row.QuerySelector("td.f-name-col > div.f-name > a");

    // The anchor may be missing; the href is then passed on as null.
    var forumHref = forumAnchor?.GetAttribute("href");
    var forumId = ParseUtil.GetArgumentFromQueryString(forumHref, "f");

    return _categories.MapTrackerCatToNewznab(forumId);
}
/// <summary>
/// Reads the release size from the <c>data-ts_text</c> attribute of the row's size cell.
/// </summary>
/// <param name="row">Result row element to inspect.</param>
/// <returns>The value produced by <c>ParseUtil.GetBytes</c> for that attribute.</returns>
private long GetSizeOfRelease(in IElement row)
{
    var sizeCell = row.QuerySelector("td.tor-size");
    var rawSize = sizeCell.GetAttribute("data-ts_text");

    return ParseUtil.GetBytes(rawSize);
}
/// <summary>
/// Reads the publish date from the <c>data-ts_text</c> attribute of the row's tenth cell.
/// </summary>
/// <param name="row">Result row element to inspect.</param>
/// <returns>The attribute value parsed as a Unix timestamp and converted to a <see cref="DateTime"/>.</returns>
private DateTime GetPublishDateOfRelease(in IElement row)
{
    var dateCell = row.QuerySelector("td:nth-child(10)");
    var rawTimestamp = dateCell.GetAttribute("data-ts_text");
    var unixSeconds = long.Parse(rawTimestamp);

    return DateTimeUtil.UnixTimestampToDateTime(unixSeconds);
}
public Action<IDictionary<string, string>, DateTime?> CookiesUpdater { get; set; }
}
public class RuTrackerTitleParser
{
private static readonly List<Regex> FindTagsInTitlesRegexList = new ()
{
new Regex(@"\((?>\((?<c>)|[^()]+|\)(?<-c>))*(?(c)(?!))\)"),
new Regex(@"\[(?>\[(?<c>)|[^\[\]]+|\](?<-c>))*(?(c)(?!))\]")
};
private readonly Regex _stripCyrillicRegex = new (@"(\([\p{IsCyrillic}\W]+\))|(^[\p{IsCyrillic}\W\d]+\/ )|([\p{IsCyrillic} \-]+,+)|([\p{IsCyrillic}]+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleCommaRegex = new (@"\s(\d+),(\d+)", RegexOptions.Compiled);
private readonly Regex _tvTitleCyrillicXRegex = new (@"([\s-])Х+([\s\)\]])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleRusSeasonEpisodeOfRegex = new (@"Сезон\s*[:]*\s+(\d+).+(?:Серии|Эпизод|Выпуски)+\s*[:]*\s+(\d+(?:-\d+)?)\s*из\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleRusSeasonEpisodeRegex = new (@"Сезон\s*[:]*\s+(\d+).+(?:Серии|Эпизод|Выпуски)+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleRusSeasonRegex = new (@"Сезон\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleRusEpisodeOfRegex = new (@"(?:Серии|Эпизод|Выпуски)+\s*[:]*\s+(\d+(?:-\d+)?)\s*из\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleRusEpisodeRegex = new (@"(?:Серии|Эпизод|Выпуски)+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
public string Parse(string title, ICollection<IndexerCategory> categories, bool stripCyrillicLetters = true, bool moveFirstTagsToEndOfReleaseTitle = false, bool moveAllTagsToEndOfReleaseTitle = false)
{
// https://www.fileformat.info/info/unicode/category/Pd/list.htm
title = Regex.Replace(title, @"\p{Pd}", "-", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// replace double 4K quality in title
title = Regex.Replace(title, @"\b(2160p), 4K\b", "$1", RegexOptions.Compiled | RegexOptions.IgnoreCase);
if (IsAnyTvCategory(categories))
{
title = _tvTitleCommaRegex.Replace(title, " $1-$2");
title = _tvTitleCyrillicXRegex.Replace(title, "$1XX$2");
title = _tvTitleRusSeasonEpisodeOfRegex.Replace(title, "S$1E$2 of $3");
title = _tvTitleRusSeasonEpisodeRegex.Replace(title, "S$1E$2");
title = _tvTitleRusSeasonRegex.Replace(title, "S$1");
title = _tvTitleRusEpisodeOfRegex.Replace(title, "E$1 of $2");
title = _tvTitleRusEpisodeRegex.Replace(title, "E$1");
}
else if (IsAnyMovieCategory(categories))
{
// Bluray quality fix: radarr parse Blu-ray Disc as Bluray-1080p but should be BR-DISK
release.Title = Regex.Replace(release.Title, "Blu-ray Disc", "BR-DISK", RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bBlu-ray Disc\b", "BR-DISK", RegexOptions.Compiled | RegexOptions.IgnoreCase);
}
if (IsAnyTvCategory(release.Categories) | IsAnyMovieCategory(release.Categories))
if (IsAnyTvCategory(categories) | IsAnyMovieCategory(categories))
{
// remove director's name from title
// rutracker movies titles look like: russian name / english name (russian director / english director) other stuff
// Ирландец / The Irishman (Мартин Скорсезе / Martin Scorsese) [2019, США, криминал, драма, биография, WEB-DL 1080p] Dub (Пифагор) + MVO (Jaskier) + AVO (Юрий Сербин) + Sub Rus, Eng + Original Eng
// this part should be removed: (Мартин Скорсезе / Martin Scorsese)
//var director = new Regex(@"(\([А-Яа-яЁё\W]+)\s/\s(.+?)\)");
var director = new Regex(@"(\([А-Яа-яЁё\W].+?\))");
release.Title = director.Replace(release.Title, "");
title = Regex.Replace(title, @"(\([\p{IsCyrillic}\W]+)\s/\s(.+?)\)", string.Empty, RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Remove VO, MVO and DVO from titles
var vo = new Regex(@".VO\s\(.+?\)");
release.Title = vo.Replace(release.Title, "");
title = vo.Replace(title, string.Empty);
// Remove R5 and (R5) from release names
var r5 = new Regex(@"(.*)(.R5.)(.*)");
release.Title = r5.Replace(release.Title, "$1");
title = r5.Replace(title, "$1");
// Remove Sub languages from release names
var sub = new Regex(@"(Sub.*\+)|(Sub.*$)");
release.Title = sub.Replace(release.Title, "");
title = Regex.Replace(title, @"(\bSub\b.*$|\b[\+]*Sub[\+]*\b)", string.Empty);
// language fix: all rutracker releases contains russian track
if (release.Title.IndexOf("rus", StringComparison.OrdinalIgnoreCase) < 0)
if (title.IndexOf("rus", StringComparison.OrdinalIgnoreCase) < 0)
{
release.Title += " rus";
}
// remove russian letters
if (_settings.RussianLetters == true)
{
//Strip russian letters
var rusRegex = new Regex(@"(\([А-Яа-яЁё\W]+\))|(^[А-Яа-яЁё\W\d]+\/ )|([а-яА-ЯЁё \-]+,+)|([а-яА-ЯЁё]+)");
release.Title = rusRegex.Replace(release.Title, "");
// Replace everything after first forward slash with a year (to avoid filtering away releases with an fwdslash after title+year, like: Title Year [stuff / stuff])
var fwdslashRegex = new Regex(@"(\/\s.+?\[)");
release.Title = fwdslashRegex.Replace(release.Title, "[");
title += " rus";
}
}
return release;
}
if (stripCyrillicLetters)
{
title = _stripCyrillicRegex.Replace(title, string.Empty).Trim(' ', '-');
}
private int GetSeedersOfRelease(in IElement row)
{
var seeders = 0;
var qSeeders = row.QuerySelector("td:nth-child(7)");
if (qSeeders != null && !qSeeders.TextContent.Contains("дн"))
if (moveAllTagsToEndOfReleaseTitle)
{
var seedersString = qSeeders.QuerySelector("b").TextContent;
if (!string.IsNullOrWhiteSpace(seedersString))
{
seeders = ParseUtil.CoerceInt(seedersString);
}
title = MoveAllTagsToEndOfReleaseTitle(title);
}
else if (moveFirstTagsToEndOfReleaseTitle)
{
title = MoveFirstTagsToEndOfReleaseTitle(title);
}
return seeders;
}
title = Regex.Replace(title, @"\b-Rip\b", "Rip", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bHDTVRip\b", "HDTV", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bWEB-DLRip\b", "WEB-DL", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bWEBDLRip\b", "WEB-DL", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bWEBDL\b", "WEB-DL", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bКураж-Бамбей\b", "kurazh", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private ICollection<IndexerCategory> GetCategoryOfRelease(in IElement row)
{
var forum = row.QuerySelector("td.f-name-col > div.f-name > a");
var forumid = forum.GetAttribute("href").Split('=')[1];
return _categories.MapTrackerCatToNewznab(forumid);
title = Regex.Replace(title, @"\(\s*\/\s*", "(", RegexOptions.Compiled);
title = Regex.Replace(title, @"\s*\/\s*\)", ")", RegexOptions.Compiled);
title = Regex.Replace(title, @"[\[\(]\s*[\)\]]", "", RegexOptions.Compiled);
title = Regex.Replace(title, @"\s+\+(?:\s+\+)+\s+", " + ", RegexOptions.Compiled);
title = title.Trim(' ', '&', ',', '.', '!', '?', '+', '-', '_', '|', '/', '\\', ':');
// replace multiple spaces with a single space
title = Regex.Replace(title, @"\s+", " ");
return title.Trim();
}
private long GetSizeOfRelease(in IElement row)
private static bool IsAnyTvCategory(ICollection<IndexerCategory> category)
{
var qSize = row.QuerySelector("td.tor-size");
var size = ParseUtil.GetBytes(qSize.GetAttribute("data-ts_text"));
return size;
return category.Contains(NewznabStandardCategory.TV) || NewznabStandardCategory.TV.SubCategories.Any(subCat => category.Contains(subCat));
}
private DateTime GetPublishDateOfRelease(in IElement row)
private static bool IsAnyMovieCategory(ICollection<IndexerCategory> category)
{
var timestr = row.QuerySelector("td:nth-child(10)").GetAttribute("data-ts_text");
var publishDate = DateTimeUtil.UnixTimestampToDateTime(long.Parse(timestr));
return publishDate;
return category.Contains(NewznabStandardCategory.Movies) || NewznabStandardCategory.Movies.SubCategories.Any(subCat => category.Contains(subCat));
}
private bool IsAnyTvCategory(ICollection<IndexerCategory> category)
private static string MoveAllTagsToEndOfReleaseTitle(string input)
{
return category.Contains(NewznabStandardCategory.TV)
|| NewznabStandardCategory.TV.SubCategories.Any(subCat => category.Contains(subCat));
var output = input;
foreach (var findTagsRegex in FindTagsInTitlesRegexList)
{
foreach (Match match in findTagsRegex.Matches(input))
{
var tag = match.ToString();
output = $"{output.Replace(tag, "")} {tag}".Trim();
}
}
return output.Trim();
}
private bool IsAnyMovieCategory(ICollection<IndexerCategory> category)
private static string MoveFirstTagsToEndOfReleaseTitle(string input)
{
return category.Contains(NewznabStandardCategory.Movies)
|| NewznabStandardCategory.Movies.SubCategories.Any(subCat => category.Contains(subCat));
}
var output = input;
foreach (var findTagsRegex in FindTagsInTitlesRegexList)
{
var expectedIndex = 0;
foreach (Match match in findTagsRegex.Matches(output))
{
if (match.Index > expectedIndex)
{
var substring = output.Substring(expectedIndex, match.Index - expectedIndex);
if (string.IsNullOrWhiteSpace(substring))
{
expectedIndex = match.Index;
}
else
{
break;
}
}
var tag = match.ToString();
var regex = new Regex(Regex.Escape(tag));
output = $"{regex.Replace(output, string.Empty, 1)} {tag}".Trim();
expectedIndex += tag.Length;
}
}
public Action<IDictionary<string, string>, DateTime?> CookiesUpdater { get; set; }
return output.Trim();
}
}
public class RuTrackerSettings : UserPassTorrentBaseSettings
@ -1736,9 +1820,17 @@ namespace NzbDrone.Core.Indexers.Definitions
/// <summary>
/// Creates the settings with all three checkbox options switched off.
/// </summary>
public RuTrackerSettings()
{
    MoveAllTagsToEndOfReleaseTitle = false;
    MoveFirstTagsToEndOfReleaseTitle = false;
    RussianLetters = false;
}
[FieldDefinition(4, Label = "Strip Russian letters", Type = FieldType.Checkbox, SelectOptionsProviderAction = "stripRussian", HelpText = "Removes russian letters")]
[FieldDefinition(4, Label = "Strip Russian letters", Type = FieldType.Checkbox, HelpText = "Removes russian letters")]
public bool RussianLetters { get; set; }
[FieldDefinition(5, Label = "Move first tags to end of release title", Type = FieldType.Checkbox)]
public bool MoveFirstTagsToEndOfReleaseTitle { get; set; }
[FieldDefinition(6, Label = "Move all tags to end of release title", Type = FieldType.Checkbox)]
public bool MoveAllTagsToEndOfReleaseTitle { get; set; }
}
}

@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
@ -52,16 +51,15 @@ namespace NzbDrone.Core.Indexers.Definitions
protected override async Task DoLogin()
{
var loginUrl = Settings.BaseUrl + "login.php";
var loginUrl = $"{Settings.BaseUrl}login.php";
var requestBuilder = new HttpRequestBuilder(loginUrl)
{
LogResponseContent = true,
AllowAutoRedirect = true,
Method = HttpMethod.Post
AllowAutoRedirect = true
};
var authLoginRequest = requestBuilder
var authLoginRequest = requestBuilder.Post()
.AddFormParameter("username", Settings.Username)
.AddFormParameter("password", Settings.Password)
.AddFormParameter("autologin", "on")
@ -76,8 +74,6 @@ namespace NzbDrone.Core.Indexers.Definitions
if (CheckIfLoginNeeded(response))
{
_logger.Debug(response.Content);
var parser = new HtmlParser();
var dom = parser.ParseDocument(response.Content);
var errorMessage = dom.QuerySelector("table.forumline table span.gen")?.FirstChild?.TextContent;
@ -328,16 +324,12 @@ namespace NzbDrone.Core.Indexers.Definitions
};
var queryCats = _capabilities.Categories.MapTorznabCapsToTrackers(categories);
if (queryCats.Any())
{
foreach (var cat in queryCats)
{
parameters.Add("f[]", $"{cat}");
}
queryCats.ForEach(cat => parameters.Add("f[]", $"{cat}"));
}
var searchUrl = _settings.BaseUrl + "tracker.php";
var searchUrl = $"{_settings.BaseUrl}tracker.php";
if (parameters.Count > 0)
{
@ -358,6 +350,8 @@ namespace NzbDrone.Core.Indexers.Definitions
private readonly TolokaSettings _settings;
private readonly IndexerCapabilitiesCategories _categories;
private readonly TolokaTitleParser _titleParser = new ();
public TolokaParser(TolokaSettings settings, IndexerCapabilitiesCategories categories)
{
_settings = settings;
@ -383,10 +377,9 @@ namespace NzbDrone.Core.Indexers.Definitions
}
var infoUrl = _settings.BaseUrl + row.QuerySelector("td:nth-child(3) > a")?.GetAttribute("href");
var title = row.QuerySelector("td:nth-child(3) > a")?.TextContent.Trim() ?? string.Empty;
var title = row.QuerySelector("td:nth-child(3) > a").TextContent.Trim();
var categoryLink = row.QuerySelector("td:nth-child(2) > a").GetAttribute("href");
var categoryLink = row.QuerySelector("td:nth-child(2) > a")?.GetAttribute("href") ?? string.Empty;
var cat = ParseUtil.GetArgumentFromQueryString(categoryLink, "f");
var categories = _categories.MapTrackerCatToNewznab(cat);
@ -394,14 +387,15 @@ namespace NzbDrone.Core.Indexers.Definitions
var peers = seeders + ParseUtil.CoerceInt(row.QuerySelector("td:nth-child(11) > b")?.TextContent.Trim());
// 2023-01-21
var added = row.QuerySelector("td:nth-child(13)").TextContent.Trim();
var added = row.QuerySelector("td:nth-child(13)")?.TextContent.Trim() ?? string.Empty;
var release = new TorrentInfo
{
Guid = infoUrl,
InfoUrl = infoUrl,
DownloadUrl = _settings.BaseUrl + downloadUrl,
Title = CleanTitle(title, categories, _settings.StripCyrillicLetters),
Title = _titleParser.Parse(title, categories, _settings.StripCyrillicLetters),
Description = title,
Categories = categories,
Seeders = seeders,
Peers = peers,
@ -420,27 +414,65 @@ namespace NzbDrone.Core.Indexers.Definitions
return releaseInfos.ToArray();
}
/// <summary>
/// Tells whether the collection holds the standard TV category or any of its sub-categories.
/// </summary>
/// <param name="category">Categories assigned to a release.</param>
/// <returns><c>true</c> when a TV category or TV sub-category is present; otherwise <c>false</c>.</returns>
private static bool IsAnyTvCategory(ICollection<IndexerCategory> category)
{
    if (category.Contains(NewznabStandardCategory.TV))
    {
        return true;
    }

    // Stop at the first sub-category that is present.
    foreach (var subCategory in NewznabStandardCategory.TV.SubCategories)
    {
        if (category.Contains(subCategory))
        {
            return true;
        }
    }

    return false;
}
public Action<IDictionary<string, string>, DateTime?> CookiesUpdater { get; set; }
}
private static string CleanTitle(string title, ICollection<IndexerCategory> categories, bool stripCyrillicLetters = true)
public class TolokaTitleParser
{
private static readonly List<Regex> FindTagsInTitlesRegexList = new ()
{
var tvShowTitleRegex = new Regex(".+\\/\\s([^а-яА-я\\/]+)\\s\\/.+Сезон\\s*[:]*\\s+(\\d+).+(?:Серії|Епізод)+\\s*[:]*\\s+(\\d+-*\\d*).+,\\s+(.+)\\]\\s(.+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
var stripCyrillicRegex = new Regex(@"(\([\p{IsCyrillic}\W]+\))|(^[\p{IsCyrillic}\W\d]+\/ )|([\p{IsCyrillic} \-]+,+)|([\p{IsCyrillic}]+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
new Regex(@"\((?>\((?<c>)|[^()]+|\)(?<-c>))*(?(c)(?!))\)"),
new Regex(@"\[(?>\[(?<c>)|[^\[\]]+|\](?<-c>))*(?(c)(?!))\]")
};
private readonly Regex _tvTitleCommaRegex = new (@"\s(\d+),(\d+)", RegexOptions.Compiled);
private readonly Regex _tvTitleCyrillicXRegex = new (@"([\s-])Х+([\)\]])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleMultipleSeasonsRegex = new (@"(?:Сезон|Seasons?)\s*[:]*\s+(\d+-\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleUkrSeasonEpisodeOfRegex = new (@"Сезон\s*[:]*\s+(\d+).+(?:Серії|Серія|Серій|Епізод)+\s*[:]*\s+(\d+(?:-\d+)?)\s*з\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleUkrSeasonEpisodeRegex = new (@"Сезон\s*[:]*\s+(\d+).+(?:Серії|Серія|Серій|Епізод)+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleUkrSeasonRegex = new (@"Сезон\s*[:]*\s+(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleUkrEpisodeOfRegex = new (@"(?:Серії|Серія|Серій|Епізод)+\s*[:]*\s+(\d+(?:-\d+)?)\s*з\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleUkrEpisodeRegex = new (@"(?:Серії|Серія|Серій|Епізод)+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleEngSeasonEpisodeOfRegex = new (@"Season\s*[:]*\s+(\d+).+(?:Episodes?)+\s*[:]*\s+(\d+(?:-\d+)?)\s*of\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleEngSeasonEpisodeRegex = new (@"Season\s*[:]*\s+(\d+).+(?:Episodes?)+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleEngSeasonRegex = new (@"Season\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleEngEpisodeOfRegex = new (@"(?:Episodes?)+\s*[:]*\s+(\d+(?:-\d+)?)\s*of\s*([\w?])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _tvTitleEngEpisodeRegex = new (@"(?:Episodes?)+\s*[:]+\s*[:]*\s+(\d+(?:-\d+)?)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Regex _stripCyrillicRegex = new (@"(\([\p{IsCyrillic}\W]+\))|(^[\p{IsCyrillic}\W\d]+\/ )|([\p{IsCyrillic} \-]+,+)|([\p{IsCyrillic}]+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
public string Parse(string title, ICollection<IndexerCategory> categories, bool stripCyrillicLetters = true)
{
// https://www.fileformat.info/info/unicode/category/Pd/list.htm
title = Regex.Replace(title, "\\p{Pd}", "-", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\p{Pd}", "-", RegexOptions.Compiled | RegexOptions.IgnoreCase);
if (IsAnyTvCategory(categories))
{
// extract season and episodes
title = tvShowTitleRegex.Replace(title, "$1 - S$2E$3 - rus $4 $5");
title = _tvTitleCommaRegex.Replace(title, " $1-$2");
title = _tvTitleCyrillicXRegex.Replace(title, "$1XX$2");
// special case for multiple seasons
title = _tvTitleMultipleSeasonsRegex.Replace(title, "S$1");
title = _tvTitleUkrSeasonEpisodeOfRegex.Replace(title, "S$1E$2 of $3");
title = _tvTitleUkrSeasonEpisodeRegex.Replace(title, "S$1E$2");
title = _tvTitleUkrSeasonRegex.Replace(title, "S$1");
title = _tvTitleUkrEpisodeOfRegex.Replace(title, "E$1 of $2");
title = _tvTitleUkrEpisodeRegex.Replace(title, "E$1");
title = _tvTitleEngSeasonEpisodeOfRegex.Replace(title, "S$1E$2 of $3");
title = _tvTitleEngSeasonEpisodeRegex.Replace(title, "S$1E$2");
title = _tvTitleEngSeasonRegex.Replace(title, "S$1");
title = _tvTitleEngEpisodeOfRegex.Replace(title, "E$1 of $2");
title = _tvTitleEngEpisodeRegex.Replace(title, "E$1");
}
else if (stripCyrillicLetters)
if (stripCyrillicLetters)
{
title = stripCyrillicRegex.Replace(title, string.Empty);
title = _stripCyrillicRegex.Replace(title, string.Empty).Trim(' ', '-');
}
title = Regex.Replace(title, @"\b-Rip\b", "Rip", RegexOptions.Compiled | RegexOptions.IgnoreCase);
@ -449,10 +481,56 @@ namespace NzbDrone.Core.Indexers.Definitions
title = Regex.Replace(title, @"\bWEBDLRip\b", "WEB-DL", RegexOptions.Compiled | RegexOptions.IgnoreCase);
title = Regex.Replace(title, @"\bWEBDL\b", "WEB-DL", RegexOptions.Compiled | RegexOptions.IgnoreCase);
return title.Trim(' ', '.', '-', '_', '|', '/', '\'');
title = MoveFirstTagsToEndOfReleaseTitle(title);
title = Regex.Replace(title, @"\(\s*\/\s*", "(", RegexOptions.Compiled);
title = Regex.Replace(title, @"\s*\/\s*\)", ")", RegexOptions.Compiled);
title = Regex.Replace(title, @"[\[\(]\s*[\)\]]", "", RegexOptions.Compiled);
title = title.Trim(' ', '&', ',', '.', '!', '?', '+', '-', '_', '|', '/', '\\', ':');
// replace multiple spaces with a single space
title = Regex.Replace(title, @"\s+", " ");
return title.Trim();
}
public Action<IDictionary<string, string>, DateTime?> CookiesUpdater { get; set; }
/// <summary>
/// Checks a release's categories for the standard TV category or one of its sub-categories.
/// </summary>
/// <param name="category">Categories assigned to a release.</param>
/// <returns><c>true</c> if the TV category or any TV sub-category is contained; otherwise <c>false</c>.</returns>
private static bool IsAnyTvCategory(ICollection<IndexerCategory> category)
{
    var tvRoot = NewznabStandardCategory.TV;

    if (category.Contains(tvRoot))
    {
        return true;
    }

    // The root category is absent, so look for the first matching sub-category.
    foreach (var tvChild in tvRoot.SubCategories)
    {
        if (category.Contains(tvChild))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Moves the tags found at the start of a title to its end.
/// </summary>
/// <remarks>
/// Runs once for each pattern in FindTagsInTitlesRegexList. Matches are visited in order; a match is
/// moved only when nothing but whitespace lies between the tracked position and the match, and the
/// first match preceded by other text ends processing for that pattern.
/// </remarks>
/// <param name="input">Title to rearrange.</param>
/// <returns>The trimmed title with the leading tags appended, separated by single spaces.</returns>
private static string MoveFirstTagsToEndOfReleaseTitle(string input)
{
var output = input;
foreach (var findTagsRegex in FindTagsInTitlesRegexList)
{
// Position up to which leading tags have been consumed for this pattern.
var expectedIndex = 0;
foreach (Match match in findTagsRegex.Matches(output))
{
if (match.Index > expectedIndex)
{
// NOTE(review): match.Index refers to the string that was passed to Matches(), while this
// Substring reads `output`, which earlier iterations may already have rewritten - confirm
// the offsets still line up (and stay within bounds) after a tag has been moved.
var substring = output.Substring(expectedIndex, match.Index - expectedIndex);
if (string.IsNullOrWhiteSpace(substring))
{
// Only whitespace in between: the match still counts as a leading tag.
expectedIndex = match.Index;
}
else
{
// Other text precedes this match, so the run of leading tags is over.
break;
}
}
var tag = match.ToString();
// Escape the tag so it is matched literally; the count of 1 removes only its first occurrence.
var regex = new Regex(Regex.Escape(tag));
output = $"{regex.Replace(output, string.Empty, 1)} {tag}".Trim();
// Advance past the tag that was just moved.
expectedIndex += tag.Length;
}
}
return output.Trim();
}
}
public class TolokaSettings : UserPassTorrentBaseSettings

Loading…
Cancel
Save