From bd70be289bdf4eef68a5e56c51ac81748f986288 Mon Sep 17 00:00:00 2001
From: "Christopher K. Hoadley"
Date: Tue, 28 May 2019 07:26:30 -0500
Subject: [PATCH] Change method used to get site ranking.

Not only has alexa.com changed the format of their web site, they also
seem to have gotten even more picky about people scraping it. So, use
their API to query the site data.

Not sure why the API was not used earlier. It is much more efficient as
much less data is passed. And, it is less likely to have its format
changed randomly.

It did flake out on some sites one time I used it. But, then it worked
reliably next. I am not going to do many requests, as it probably has
some query rate limit.
---
 site_list.py | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/site_list.py b/site_list.py
index 4b12000..3cc19a3 100644
--- a/site_list.py
+++ b/site_list.py
@@ -5,7 +5,7 @@ import json
 import json
 import sys
 import requests
 import threading
-from bs4 import BeautifulSoup as bs
+import xml.etree.ElementTree as ET
 from datetime import datetime
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
@@ -13,17 +13,29 @@ pool = list()
 
 def get_rank(domain_to_query, dest):
+    """Store the Alexa rank of a domain in dest['rank'].
+
+    Queries the data.alexa.com XML API for domain_to_query. dest['rank']
+    is always written: the reported rank on success, or -1 when the
+    request fails or the reply carries no usable REACH/RANK value.
+    """
     result = -1
-    url = "http://www.alexa.com/siteinfo/" + domain_to_query
-    page = requests.get(url).text
-    soup = bs(page, features="lxml")
-    for span in soup.find_all('span'):
-        if span.has_attr("class"):
-            if "globleRank" in span["class"]:
-                for strong in span.find_all("strong"):
-                    if strong.has_attr("class"):
-                        if "metrics-data" in strong["class"]:
-                            result = int(strong.text.strip().replace(',', ''))
-    dest['rank'] = result
+    xml_data = ""
+
+    #Retrieve ranking data via alexa API
+    url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
+    try:
+        #A timeout keeps a stalled request from blocking the caller forever.
+        xml_data = requests.get(url, timeout=30).text
+        root = ET.fromstring(xml_data)
+        #Get ranking for this site.
+        result = int(root.find(".//REACH").attrib["RANK"])
+    except (requests.RequestException, ET.ParseError, AttributeError,
+            KeyError, ValueError):
+        #We did not find the rank for some reason.
+        print(f"Error retrieving rank information for '{domain_to_query}'")
+        print(f" Returned XML is |{xml_data}|")
+
+    dest['rank'] = result
 
 parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
                         )