Change method used to get site ranking. Not only has alexa.com changed the format of their web site, they also seem to have gotten even more picky about people scraping it. So, use their API to query the site data.

Not sure why the API was not used earlier.  It is much more efficient, as much less data is passed, and it is less likely to have its format changed randomly.  It did flake out on some sites on one occasion when I used it, but it then worked reliably on the next attempt.  I am not going to do many requests, as it probably has some query rate limit.
pull/209/head
Christopher K. Hoadley 5 years ago
parent b8341a1e6a
commit bd70be289b

@ -5,7 +5,7 @@ import json
import sys import sys
import requests import requests
import threading import threading
from bs4 import BeautifulSoup as bs import xml.etree.ElementTree as ET
from datetime import datetime from datetime import datetime
from argparse import ArgumentParser, RawDescriptionHelpFormatter from argparse import ArgumentParser, RawDescriptionHelpFormatter
@ -13,17 +13,20 @@ pool = list()
def get_rank(domain_to_query, dest): def get_rank(domain_to_query, dest):
result = -1 result = -1
url = "http://www.alexa.com/siteinfo/" + domain_to_query
page = requests.get(url).text #Retrieve ranking data via alexa API
soup = bs(page, features="lxml") url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
for span in soup.find_all('span'): xml_data = requests.get(url).text
if span.has_attr("class"): root = ET.fromstring(xml_data)
if "globleRank" in span["class"]: try:
for strong in span.find_all("strong"): #Get ranking for this site.
if strong.has_attr("class"): dest['rank'] = int(root.find(".//REACH").attrib["RANK"])
if "metrics-data" in strong["class"]: except:
result = int(strong.text.strip().replace(',', '')) #We did not find the rank for some reason.
dest['rank'] = result print(f"Error retrieving rank information for '{domain_to_query}'")
print(f" Returned XML is |{xml_data}|")
return
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
) )

Loading…
Cancel
Save