From 1442f333c2a727e45cb798c692985940fa79c326 Mon Sep 17 00:00:00 2001
From: Avinash Shenoy
Date: Sun, 27 Jan 2019 15:01:55 +0530
Subject: [PATCH 1/2] Parallelised updating Alexa.com ranking of sites

Script now fetches Alexa ranks for sites concurrently on separate threads.
Cuts down the time to sync ranks from approximately **5 minutes** to about
**18 seconds**.
---
 site_list.py | 65 ++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/site_list.py b/site_list.py
index 8292be4e..cf8f414d 100644
--- a/site_list.py
+++ b/site_list.py
@@ -1,56 +1,55 @@
 """Sherlock: Supported Site Listing
-
 This module generates the listing of supported sites.
 """
 import json
 import sys
 import requests
-from argparse import ArgumentParser, RawDescriptionHelpFormatter
+import threading
 from bs4 import BeautifulSoup as bs
-
-def get_rank(domain_to_query):
-    result = -1
-    url = "http://www.alexa.com/siteinfo/" + domain_to_query
-    page = requests.get(url).text
-    soup = bs(page, features="lxml")
-    for span in soup.find_all('span'):
-        if span.has_attr("class"):
-            if "globleRank" in span["class"]:
-                for strong in span.find_all("strong"):
-                    if strong.has_attr("class"):
-                        if "metrics-data" in strong["class"]:
-                            result = int(strong.text.strip().replace(',', ''))
-    return result
-
-parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
-                        )
-parser.add_argument("--rank","-r",
-                    action="store_true", dest="rank", default=False,
-                    help="Update all website ranks (not recommended)."
-                    )
-args = parser.parse_args()
+from datetime import datetime
+
+pool = list()
+
+def get_rank(domain_to_query, dest):
+    result = -1
+    url = "http://www.alexa.com/siteinfo/" + domain_to_query
+    page = requests.get(url).text
+    soup = bs(page, features="lxml")
+    for span in soup.find_all('span'):
+        if span.has_attr("class"):
+            if "globleRank" in span["class"]:
+                for strong in span.find_all("strong"):
+                    if strong.has_attr("class"):
+                        if "metrics-data" in strong["class"]:
+                            result = int(strong.text.strip().replace(',', ''))
+    dest = result
 
 with open("data.json", "r", encoding="utf-8") as data_file:
-    data = json.load(data_file)
+    data = json.load(data_file)
 
 with open("sites.md", "w") as site_file:
     data_length = len(data)
     site_file.write(f'## List Of Supported Sites ({data_length} Sites In Total!)\n')
 
-    index = 1
     for social_network in data:
         url_main = data.get(social_network).get("urlMain")
+        th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)["rank"]))
+        pool.append((url_main, th))
+        th.start()
+
+    index = 1
+    for social_network, th in pool:
+        th.join()
         site_file.write(f'{index}. [{social_network}]({url_main})\n')
-        if args.rank == True:
-            data.get(social_network)["rank"] = get_rank(url_main)
-        sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
-        sys.stdout.flush()
+        sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
+        sys.stdout.flush()
         index = index + 1
 
+    site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
+
 sorted_json_data = json.dumps(data, indent=2, sort_keys=True)
 
 with open("data.json", "w") as data_file:
-    data_file.write(sorted_json_data)
+    data_file.write(sorted_json_data)
 
-sys.stdout.write("\r{0}".format(f"Finished updating supported site listing!\n"))
-sys.stdout.flush()
+print("\nFinished updating supported site listing!")
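[Note on PATCH 1/2] The assignment `dest = result` at the end of get_rank()
only rebinds the function-local name, so the fetched rank never reaches the
caller's `data` dict; PATCH 2/2 below fixes this by passing the whole
per-site dict into the thread and assigning into it (`dest['rank'] = result`).
A minimal sketch of the Python semantics at play, with hypothetical names
not taken from the patch:

    import threading

    def worker(value, dest):
        # dest = value        # would rebind the local name only; result lost
        dest["rank"] = value  # mutates the shared dict; caller observes it

    shared = {"rank": -1}
    th = threading.Thread(target=worker, args=(42, shared))
    th.start()
    th.join()                 # wait for the worker, as site_list.py does
    print(shared["rank"])     # prints 42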
From 3db3f4558bd94f6f088a76f47eb21dc869fe1b56 Mon Sep 17 00:00:00 2001
From: Avinash Shenoy
Date: Sun, 27 Jan 2019 15:20:45 +0530
Subject: [PATCH 2/2] Parallelized updating alexa ranking

---
 site_list.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/site_list.py b/site_list.py
index cf8f414d..1d6b2214 100644
--- a/site_list.py
+++ b/site_list.py
@@ -7,6 +7,7 @@ import requests
 import threading
 from bs4 import BeautifulSoup as bs
 from datetime import datetime
+from argparse import ArgumentParser, RawDescriptionHelpFormatter
 
 pool = list()
 
@@ -22,7 +23,15 @@ def get_rank(domain_to_query, dest):
                     if strong.has_attr("class"):
                         if "metrics-data" in strong["class"]:
                             result = int(strong.text.strip().replace(',', ''))
-    dest = result
+    dest['rank'] = result
+
+parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
+                        )
+parser.add_argument("--rank","-r",
+                    action="store_true", dest="rank", default=False,
+                    help="Update all website ranks (not recommended)."
+                    )
+args = parser.parse_args()
 
 with open("data.json", "r", encoding="utf-8") as data_file:
     data = json.load(data_file)
@@ -33,19 +42,26 @@ with open("sites.md", "w") as site_file:
 
     for social_network in data:
         url_main = data.get(social_network).get("urlMain")
-        th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)["rank"]))
-        pool.append((url_main, th))
-        th.start()
+        data.get(social_network)["rank"] = 0
+        if args.rank:
+            th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)))
+        else:
+            th = None
+        pool.append((social_network, url_main, th))
+        if args.rank:
+            th.start()
 
     index = 1
-    for social_network, th in pool:
-        th.join()
+    for social_network, url_main, th in pool:
+        if args.rank:
+            th.join()
         site_file.write(f'{index}. [{social_network}]({url_main})\n')
         sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
         sys.stdout.flush()
         index = index + 1
 
-    site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
+    if args.rank:
+        site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
 
 sorted_json_data = json.dumps(data, indent=2, sort_keys=True)
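[Note on PATCH 2/2] With both patches applied, rank fetching is opt-in via
the restored argparse flag: every site's "rank" is reset to 0 up front, and
fetch threads are spawned and joined only when --rank is given.

    $ python site_list.py         # regenerate sites.md, no alexa.com requests
    $ python site_list.py --rank  # also fetch ranks, one thread per site

One thread per site means as many simultaneous requests as there are sites.
If that fan-out ever needs bounding, the same work could run on a fixed-size
pool; a sketch using the patched get_rank(domain_to_query, dest) signature,
where update_ranks and max_workers=32 are illustrative choices, not part of
the patch:

    from concurrent.futures import ThreadPoolExecutor

    def update_ranks(data, max_workers=32):
        # One task per site; leaving the with-block joins every worker,
        # mirroring the th.join() loop in site_list.py.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for social_network in data:
                executor.submit(get_rank,
                                data[social_network]["urlMain"],
                                data[social_network])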