From 46705ad0a17be8ed8947cd9b040175cc5a6d4ac0 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Thu, 24 Jan 2019 11:01:34 +0000 Subject: [PATCH 01/11] Add Alexa.com rank updating functionality to site_list.py --- data.json | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++ site_list.py | 43 +++++++++++++---- sites.md | 2 + 3 files changed, 169 insertions(+), 9 deletions(-) diff --git a/data.json b/data.json index 84412975..f394048a 100644 --- a/data.json +++ b/data.json @@ -2,83 +2,98 @@ "500px": { "errorMsg": "Sorry, no such page.", "errorType": "message", + "rank": 2461, "url": "https://500px.com/{}", "urlMain": "https://500px.com/" }, "9GAG": { "errorType": "status_code", + "rank": 333, "url": "https://9gag.com/u/{}", "urlMain": "https://9gag.com/" }, "About.me": { "errorType": "status_code", + "rank": 12686, "url": "https://about.me/{}", "urlMain": "https://about.me/" }, "Academia.edu": { "errorMsg": "Page Not Found", "errorType": "message", + "rank": 385, "url": "https://independent.academia.edu/{}", "urlMain": "https://www.academia.edu/" }, "AngelList": { "errorMsg": "We couldn't find what you were looking for.", "errorType": "message", + "rank": 3469, "url": "https://angel.co/{}", "urlMain": "https://angel.co/" }, "Aptoide": { "errorType": "status_code", + "rank": 6107, "url": "https://{}.en.aptoide.com/", "urlMain": "https://en.aptoide.com/" }, "AskFM": { "errorType": "status_code", + "rank": 1109, "url": "https://ask.fm/{}", "urlMain": "https://ask.fm/" }, "BLIP.fm": { "errorMsg": "Page Not Found", "errorType": "message", + "rank": 261919, "url": "https://blip.fm/{}", "urlMain": "https://blip.fm/" }, "Badoo": { "errorType": "status_code", + "rank": 949, "url": "https://badoo.com/profile/{}", "urlMain": "https://badoo.com/" }, "Bandcamp": { "errorMsg": "Sorry, that something isn\u2019t here", "errorType": "message", + "rank": 573, "url": "https://www.bandcamp.com/{}", "urlMain": "https://www.bandcamp.com/" }, "Basecamp": { "errorMsg": "The account you 
were looking for doesn't exist", "errorType": "message", + "rank": 1559, "url": "https://{}.basecamphq.com", "urlMain": "https://basecamp.com/" }, "Behance": { "errorMsg": "Oops! We can\u2019t find that page.", "errorType": "message", + "rank": 394, "url": "https://www.behance.net/{}", "urlMain": "https://www.behance.net/" }, "BitBucket": { "errorType": "status_code", + "rank": 848, "url": "https://bitbucket.org/{}", "urlMain": "https://bitbucket.org/" }, "BlackPlanet": { "errorMsg": "My Hits", "errorType": "message", + "rank": 107509, "url": "http://blackplanet.com/{}", "urlMain": "http://blackplanet.com/" }, "Blogger": { "errorType": "status_code", + "rank": 193, "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "url": "https://{}.blogspot.com", "urlMain": "https://www.blogger.com/" @@ -86,112 +101,132 @@ "BuzzFeed": { "errorMsg": "We can't find the page you're looking for.", "errorType": "message", + "rank": 294, "url": "https://buzzfeed.com/{}", "urlMain": "https://buzzfeed.com/" }, "Canva": { "errorMsg": "Not found (404)", "errorType": "message", + "rank": 215, "url": "https://www.canva.com/{}", "urlMain": "https://www.canva.com/" }, "Carbonmade": { "errorMsg": "You've accidentally stumbled upon Mike's super secret nap grotto.", "errorType": "message", + "rank": 32239, "url": "https://{}.carbonmade.com", "urlMain": "https://carbonmade.com/" }, "CashMe": { "errorType": "status_code", + "rank": 45066, "url": "https://cash.me/{}", "urlMain": "https://cash.me/" }, "Cloob": { "errorType": "status_code", + "rank": 8052, "url": "https://www.cloob.com/name/{}", "urlMain": "https://www.cloob.com/" }, "Codecademy": { "errorMsg": "404 error", "errorType": "message", + "rank": 2314, "url": "https://www.codecademy.com/{}", "urlMain": "https://www.codecademy.com/" }, "Codementor": { "errorMsg": "404", "errorType": "message", + "rank": 12456, "url": "https://www.codementor.io/{}", "urlMain": "https://www.codementor.io/" }, "Codepen": { "errorType": "status_code", + "rank": 863, 
"url": "https://codepen.io/{}", "urlMain": "https://codepen.io/" }, "Coderwall": { "errorMsg": "404! Our feels when that url is used", "errorType": "message", + "rank": 17346, "url": "https://coderwall.com/{}", "urlMain": "https://coderwall.com/" }, "ColourLovers": { "errorMsg": "Page Not Loved", "errorType": "message", + "rank": 30625, "url": "https://www.colourlovers.com/love/{}", "urlMain": "https://www.colourlovers.com/" }, "Contently": { "errorMsg": "We can't find that page!", "errorType": "message", + "rank": 59032, "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "url": "https://{}.contently.com/", "urlMain": "https://contently.com/" }, "Coroflot": { "errorType": "status_code", + "rank": 37568, "url": "https://www.coroflot.com/{}", "urlMain": "https://coroflot.com/" }, "CreativeMarket": { "errorType": "status_code", + "rank": 1790, "url": "https://creativemarket.com/{}", "urlMain": "https://creativemarket.com/" }, "Crevado": { "errorType": "status_code", + "rank": 168903, "url": "https://{}.crevado.com", "urlMain": "https://crevado.com/" }, "Crunchyroll": { "errorType": "status_code", + "rank": 463, "url": "https://www.crunchyroll.com/user/{}", "urlMain": "https://www.crunchyroll.com/" }, "DailyMotion": { "errorType": "status_code", + "rank": 132, "url": "https://www.dailymotion.com/{}", "urlMain": "https://www.dailymotion.com/" }, "Designspiration": { "errorMsg": "Content Not Found", "errorType": "message", + "rank": 24722, "url": "https://www.designspiration.net/{}", "urlMain": "https://www.designspiration.net/" }, "DeviantART": { "errorType": "status_code", + "rank": 185, "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "url": "https://{}.deviantart.com", "urlMain": "https://deviantart.com" }, "Disqus": { "errorType": "status_code", + "rank": 1311, "url": "https://disqus.com/{}", "urlMain": "https://disqus.com/" }, "Dribbble": { "errorMsg": "Whoops, that page is gone.", "errorType": "message", + "rank": 937, "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "url": 
"https://dribbble.com/{}", "urlMain": "https://dribbble.com/" @@ -199,66 +234,78 @@ "EVE Online": { "errorMsg": "No results found with your search...", "errorType": "message", + "rank": 11655, "url": "https://evewho.com/search/{}", "urlMain": "https://eveonline.com" }, "Ebay": { "errorMsg": "The User ID you entered was not found", "errorType": "message", + "rank": 37, "url": "https://www.ebay.com/usr/{}", "urlMain": "https://www.ebay.com/" }, "Ello": { "errorMsg": "We couldn't find the page you're looking for", "errorType": "message", + "rank": 29841, "url": "https://ello.co/{}", "urlMain": "https://ello.co/" }, "Etsy": { "errorType": "status_code", + "rank": 152, "url": "https://www.etsy.com/shop/{}", "urlMain": "https://www.etsy.com/" }, "EyeEm": { "errorType": "status_code", + "rank": 33189, "url": "https://www.eyeem.com/u/{}", "urlMain": "https://www.eyeem.com/" }, "Facebook": { "errorType": "status_code", + "rank": 3, "regexCheck": "^[a-zA-Z0-9]{4,49}(? Date: Thu, 24 Jan 2019 11:16:23 +0000 Subject: [PATCH 02/11] Correct print look --- site_list.py | 2 +- sites.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/site_list.py b/site_list.py index dc969b61..e7d42170 100644 --- a/site_list.py +++ b/site_list.py @@ -39,7 +39,7 @@ with open("sites.md", "w") as site_file: data.get(social_network)["rank"] = get_rank(url_main) index = index + 1 - site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n') + site_file.write(f'\nAlexa.com rank data fetched at {datetime.utcnow()} UTC\n') sorted_json_data = json.dumps(data, indent=2, sort_keys=True) diff --git a/sites.md b/sites.md index d6569e06..630b531e 100644 --- a/sites.md +++ b/sites.md @@ -133,4 +133,4 @@ 132. [iMGSRC.RU](https://imgsrc.ru/) 133. 
[last.fm](https://last.fm/) -Alexa.com rank data fetched at (2019-01-24 10:58:49.318475 UTC) +Alexa.com rank data fetched at 2019-01-24 10:58:49.318475 UTC From 40fc51fc320a2636c0dff5cf72d6a23da57c00ff Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Thu, 24 Jan 2019 12:35:08 +0000 Subject: [PATCH 03/11] add rank parameter to site_list.py --rank or -r to update all page ranks --- site_list.py | 20 ++++++++++++++------ sites.md | 2 -- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/site_list.py b/site_list.py index e7d42170..bdc3598c 100644 --- a/site_list.py +++ b/site_list.py @@ -5,6 +5,7 @@ This module generates the listing of supported sites. import json import sys import requests +from argparse import ArgumentParser, RawDescriptionHelpFormatter from bs4 import BeautifulSoup as bs from datetime import datetime from collections import OrderedDict @@ -15,7 +16,7 @@ def get_rank(domain_to_query): page = requests.get(url).text soup = bs(page, features="lxml") for span in soup.find_all('span'): - if span.has_attr("class"): + if span.has_attr("class"): if "globleRank" in span["class"]: for strong in span.find_all("strong"): if strong.has_attr("class"): @@ -23,6 +24,14 @@ def get_rank(domain_to_query): result = int(strong.text.strip().replace(',', '')) return result +parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter + ) +parser.add_argument("--rank","-r", + action="store_true", dest="rank", default=False, + help="Update all website ranks (not recommended)." + ) +args = parser.parse_args() + with open("data.json", "r", encoding="utf-8") as data_file: data = json.load(data_file) @@ -34,13 +43,12 @@ with open("sites.md", "w") as site_file: for social_network in data: url_main = data.get(social_network).get("urlMain") site_file.write(f'{index}. 
[{social_network}]({url_main})\n') - sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries")) - sys.stdout.flush() - data.get(social_network)["rank"] = get_rank(url_main) + if args.rank == True: + data.get(social_network)["rank"] = get_rank(url_main) + sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries")) + sys.stdout.flush() index = index + 1 - site_file.write(f'\nAlexa.com rank data fetched at {datetime.utcnow()} UTC\n') - sorted_json_data = json.dumps(data, indent=2, sort_keys=True) with open("data.json", "w") as data_file: diff --git a/sites.md b/sites.md index 630b531e..2909bfa3 100644 --- a/sites.md +++ b/sites.md @@ -132,5 +132,3 @@ 131. [devRant](https://devrant.com/) 132. [iMGSRC.RU](https://imgsrc.ru/) 133. [last.fm](https://last.fm/) - -Alexa.com rank data fetched at 2019-01-24 10:58:49.318475 UTC From cc2b1cb27a7e68d7a1b17cc09dfc0bcf2a9532f8 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Thu, 24 Jan 2019 12:50:02 +0000 Subject: [PATCH 04/11] Improve terminal appearance for site_list.py --- site_list.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/site_list.py b/site_list.py index bdc3598c..a1b73692 100644 --- a/site_list.py +++ b/site_list.py @@ -54,4 +54,5 @@ sorted_json_data = json.dumps(data, indent=2, sort_keys=True) with open("data.json", "w") as data_file: data_file.write(sorted_json_data) -print("\nFinished updating supported site listing!") +sys.stdout.write("\r{0}".format(f"Finished updating supported site listing!\n")) +sys.stdout.flush() From 9c45146da190759c046527a1d8c375b04d54b496 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Thu, 24 Jan 2019 12:54:09 +0000 Subject: [PATCH 05/11] remove unused import --- site_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/site_list.py b/site_list.py index a1b73692..bca22d0d 100644 --- a/site_list.py +++ b/site_list.py @@ -7,7 +7,6 @@ import sys import requests from argparse import ArgumentParser, 
RawDescriptionHelpFormatter from bs4 import BeautifulSoup as bs -from datetime import datetime from collections import OrderedDict def get_rank(domain_to_query): From 826af1ec1908ba269a530fb61be3be7da5281b56 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 12:45:55 +0000 Subject: [PATCH 06/11] remove unused import --- site_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/site_list.py b/site_list.py index bca22d0d..8292be4e 100644 --- a/site_list.py +++ b/site_list.py @@ -7,7 +7,6 @@ import sys import requests from argparse import ArgumentParser, RawDescriptionHelpFormatter from bs4 import BeautifulSoup as bs -from collections import OrderedDict def get_rank(domain_to_query): result = -1 From db0cf7c289a0a6895214fc2e87baf7eb4138deed Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 12:46:05 +0000 Subject: [PATCH 07/11] Update requirements.txt --- requirements.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a98de85..c70b9a6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,13 @@ +beautifulsoup4 +bs4 +certifi +chardet +colorama +idna +PySocks requests -requests_futures +requests-futures +soupsieve +stem torrequest -colorama \ No newline at end of file +urllib3 From 55d43b0ee6e1ff309af1c95aecee964939fb50d2 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 12:50:50 +0000 Subject: [PATCH 08/11] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c70b9a6d..4406938b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ certifi chardet colorama idna +lxml PySocks requests requests-futures From 5d972a31387ac88635346af53277eb1c9f56f6d0 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 15:05:38 +0000 Subject: [PATCH 09/11] add --rank -r option to sherlock --- sherlock.py | 11 +++++++++++ 1 file changed, 11 insertions(+) 
diff --git a/sherlock.py b/sherlock.py index a05294fc..d72abaa5 100644 --- a/sherlock.py +++ b/sherlock.py @@ -346,6 +346,9 @@ def main(): action="store_true", dest="verbose", default=False, help="Display extra debugging information and metrics." ) + parser.add_argument("--rank", "-r", + action="store_true", dest="rank", default=False, + help="Present websites ordered by their Alexa.com rank in popularity.") parser.add_argument("--folderoutput", "-fo", dest="folderoutput", help="If using multiple usernames, the output of the results will be saved at this folder." ) @@ -464,6 +467,14 @@ def main(): f"Error: Desired sites not found: {', '.join(site_missing)}.") sys.exit(1) + if args.rank: + # Sort data by rank + site_dataCpy = dict(site_data) + ranked_site_data = sorted(site_data, key=lambda k: site_data[k]['rank']) + site_data = {} + for site in ranked_site_data: + site_data[site] = site_dataCpy.get(site) + # Run report on all specified users. for username in args.username: print() From 78ade00deef268b88d4d7eff4c0b2d1e127c9163 Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 15:10:03 +0000 Subject: [PATCH 10/11] Update outdated README.md --- README.md | 10 +++++++++- sherlock.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6ac9b396..522d3533 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,12 @@ optional arguments: --version Display version information and dependencies. --verbose, -v, -d, --debug Display extra debugging information and metrics. - --quiet, -q Disable debugging information (Default Option). + --folderoutput FOLDEROUTPUT, -fo FOLDEROUTPUT + If using multiple usernames, the output of the results + will be saved at this folder. + --output OUTPUT, -o OUTPUT + If using single username, the output of the result + will be saved at this file. --tor, -t Make requests over TOR; increases runtime; requires TOR to be installed and in system path. 
--unique-tor, -u Make requests over TOR with new TOR circuit after each @@ -55,6 +60,9 @@ optional arguments: --proxy PROXY_URL, -p PROXY_URL Make requests over a proxy. e.g. socks5://127.0.0.1:1080 + --json JSON_FILE, -j JSON_FILE + Load data from a JSON file or an online, valid, JSON + file. ``` For example, run ```python3 sherlock.py user123```, and all of the accounts diff --git a/sherlock.py b/sherlock.py index d72abaa5..61e82f75 100644 --- a/sherlock.py +++ b/sherlock.py @@ -348,7 +348,7 @@ def main(): ) parser.add_argument("--rank", "-r", action="store_true", dest="rank", default=False, - help="Present websites ordered by their Alexa.com rank in popularity.") + help="Present websites ordered by their Alexa.com global rank in popularity.") parser.add_argument("--folderoutput", "-fo", dest="folderoutput", help="If using multiple usernames, the output of the results will be saved at this folder." ) From 8b681158bc8be0fea89bd0b402352d4bc5d3032d Mon Sep 17 00:00:00 2001 From: ptalmeida Date: Fri, 25 Jan 2019 17:36:38 +0000 Subject: [PATCH 11/11] small corrections to rank sort --- sherlock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sherlock.py b/sherlock.py index 61e82f75..7cb9a2c8 100644 --- a/sherlock.py +++ b/sherlock.py @@ -470,9 +470,9 @@ def main(): if args.rank: # Sort data by rank site_dataCpy = dict(site_data) - ranked_site_data = sorted(site_data, key=lambda k: site_data[k]['rank']) + ranked_sites = sorted(site_data, key=lambda k: ("rank" not in k, site_data[k].get("rank", sys.maxsize))) site_data = {} - for site in ranked_site_data: + for site in ranked_sites: site_data[site] = site_dataCpy.get(site) # Run report on all specified users.