Merge pull request #3 from nareddyt/async-requests

Run HTTP requests in parallel
pull/48/head
Siddharth Dushantha 6 years ago committed by GitHub
commit 0f9519365f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

3
.gitignore vendored

@ -1,3 +1,6 @@
# Virtual Environment
venv/
# Jupyter Notebook # Jupyter Notebook
.ipynb_checkpoints .ipynb_checkpoints
*.ipynb *.ipynb

Binary file not shown.

Before

Width:  |  Height:  |  Size: 92 KiB

@ -1,2 +1,3 @@
requests requests
requests_futures
torrequest torrequest

@ -3,10 +3,12 @@
This module contains the main logic to search for usernames at social This module contains the main logic to search for usernames at social
networks. networks.
""" """
import requests import requests
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession
import json import json
import os import os
import sys
import re import re
import csv import csv
from argparse import ArgumentParser, RawDescriptionHelpFormatter from argparse import ArgumentParser, RawDescriptionHelpFormatter
@ -16,27 +18,24 @@ from torrequest import TorRequest
module_name = "Sherlock: Find Usernames Across Social Networks" module_name = "Sherlock: Find Usernames Across Social Networks"
__version__ = "0.1.0" __version__ = "0.1.0"
# TODO: fix tumblr # TODO: fix tumblr
def write_to_file(url, fname): def write_to_file(url, fname):
with open(fname, "a") as f: with open(fname, "a") as f:
f.write(url+"\n") f.write(url + "\n")
def print_error(err, errstr, var, debug = False): def print_error(err, errstr, var, debug=False):
if debug: if debug:
print(f"\033[37;1m[\033[91;1m-\033[37;1m]\033[91;1m {errstr}\033[93;1m {err}") print(f"\033[37;1m[\033[91;1m-\033[37;1m]\033[91;1m {errstr}\033[93;1m {err}")
else: else:
print(f"\033[37;1m[\033[91;1m-\033[37;1m]\033[91;1m {errstr}\033[93;1m {var}") print(f"\033[37;1m[\033[91;1m-\033[37;1m]\033[91;1m {errstr}\033[93;1m {var}")
def make_request(url, headers, error_type, social_network, verbose=False, tor=False, unique_tor=False): def get_response(request_future, error_type, social_network, verbose=False):
r = TorRequest() if (tor or unique_tor) else requests
try: try:
rsp = r.get(url, headers=headers) rsp = request_future.result()
if unique_tor:
r.reset_identity()
if rsp.status_code: if rsp.status_code:
return rsp, error_type return rsp, error_type
except requests.exceptions.HTTPError as errh: except requests.exceptions.HTTPError as errh:
@ -74,95 +73,146 @@ def sherlock(username, verbose=False, tor=False, unique_tor=False):
response_text: Text that came back from request. May be None if response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence. there was an HTTP error when checking for existence.
""" """
fname = username+".txt" fname = username + ".txt"
if os.path.isfile(fname): if os.path.isfile(fname):
os.remove(fname) os.remove(fname)
print("\033[1;92m[\033[0m\033[1;77m*\033[0m\033[1;92m] Removing previous file:\033[1;37m {}\033[0m".format(fname)) print("\033[1;92m[\033[0m\033[1;77m*\033[0m\033[1;92m] Removing previous file:\033[1;37m {}\033[0m".format(fname))
print("\033[1;92m[\033[0m\033[1;77m*\033[0m\033[1;92m] Checking username\033[0m\033[1;37m {}\033[0m\033[1;92m on: \033[0m".format(username)) print("\033[1;92m[\033[0m\033[1;77m*\033[0m\033[1;92m] Checking username\033[0m\033[1;37m {}\033[0m\033[1;92m on: \033[0m".format(username))
raw = open("data.json", "r", encoding="utf-8")
data = json.load(raw)
# User agent is needed because some sites does not # User agent is needed because some sites do not
# return the correct information because it thinks that # return the correct information because it thinks that
# we are bot # we are bot
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0' 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0'
} }
# Load the data
raw = open("data.json", "r")
data = json.load(raw)
# Allow 1 thread for each external service, so `len(data)` threads total
executor = ThreadPoolExecutor(max_workers=len(data))
# Create session based on request methodology
underlying_session = requests.session()
underlying_request = requests.Request()
if tor or unique_tor:
underlying_request = TorRequest()
underlying_session = underlying_request.session()
# Create multi-threaded session for all requests
session = FuturesSession(executor=executor, session=underlying_session)
# Results from analysis of all sites # Results from analysis of all sites
results_total = {} results_total = {}
# First create futures for all requests. This allows for the requests to run in parallel
for social_network in data: for social_network in data:
# Results from analysis of this specific site # Results from analysis of this specific site
results_site = {} results_site = {}
# Record URL of main site # Record URL of main site
results_site['url_main'] = data.get(social_network).get("urlMain") results_site['url_main'] = data.get(social_network).get("urlMain")
# URL of user on site (if it exists) # Don't make request if username is invalid for the site
url = data.get(social_network).get("url").format(username) regex_check = data.get(social_network).get("regexCheck")
results_site['url_user'] = url if regex_check and re.search(regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Illegal Username Format For This Site!".format(social_network))
results_site["exists"] = "illegal"
else:
# URL of user on site (if it exists)
url = data.get(social_network).get("url").format(username)
results_site["url_user"] = url
# This future starts running the request in a new thread, doesn't block the main thread
future = session.get(url=url, headers=headers)
# Store future in data for access later
data.get(social_network)["request_future"] = future
# Reset identity for Tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all of the other results.
results_total[social_network] = results_site
# Core logic: all requests (Tor or not) were already started as futures above; here we wait for each response and analyse it
for social_network in data:
# Retrieve results again
results_site = results_total.get(social_network)
# Retrieve other site information again
url = results_site.get("url_user")
exists = results_site.get("exists")
if exists is not None:
# The username was already marked as illegal for this site, so no request was made; skip it
continue
# Get the expected error type
error_type = data.get(social_network).get("errorType") error_type = data.get(social_network).get("errorType")
regex_check = data.get(social_network).get("regexCheck")
# Default data in case there are any failures in doing a request. # Default data in case there are any failures in doing a request.
http_status = "?" http_status = "?"
response_text = "" response_text = ""
if regex_check and re.search(regex_check, username) is None: # Retrieve future and ensure it has finished
#No need to do the check at the site: this user name is not allowed. future = data.get(social_network).get("request_future")
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Illegal Username Format For This Site!".format(social_network)) r, error_type = get_response(request_future=future,
exists = "illegal" error_type=error_type,
else: social_network=social_network,
r, error_type = make_request(url=url, headers=headers, error_type=error_type, social_network=social_network, verbose=verbose, tor=tor, unique_tor=unique_tor) verbose=verbose)
# Attempt to get request information # Attempt to get request information
try: try:
http_status = r.status_code http_status = r.status_code
except: except:
pass pass
try: try:
response_text = r.text.encode(r.encoding) response_text = r.text.encode(r.encoding)
except: except:
pass pass
if error_type == "message": if error_type == "message":
error = data.get(social_network).get("errorMsg") error = data.get(social_network).get("errorMsg")
# Checks if the error message is in the HTML # Checks if the error message is in the HTML
if not error in r.text: if not error in r.text:
print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url) print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url)
write_to_file(url, fname) write_to_file(url, fname)
exists = "yes" exists = "yes"
else: else:
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network)) print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network))
exists = "no" exists = "no"
elif error_type == "status_code": elif error_type == "status_code":
# Checks if the status code of the response is 404 # Checks if the status code of the response is 404
if not r.status_code == 404: if not r.status_code == 404:
print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url) print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url)
write_to_file(url, fname) write_to_file(url, fname)
exists = "yes" exists = "yes"
else: else:
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network)) print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network))
exists = "no" exists = "no"
elif error_type == "response_url": elif error_type == "response_url":
error = data.get(social_network).get("errorUrl") error = data.get(social_network).get("errorUrl")
# Checks if the redirect url is the same as the one defined in data.json # Checks if the redirect url is the same as the one defined in data.json
if not error in r.url: if not error in r.url:
print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url) print("\033[37;1m[\033[92;1m+\033[37;1m]\033[92;1m {}:\033[0m".format(social_network), url)
write_to_file(url, fname) write_to_file(url, fname)
exists = "yes" exists = "yes"
else: else:
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network)) print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Not Found!".format(social_network))
exists = "no" exists = "no"
elif error_type == "": elif error_type == "":
print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Error!".format(social_network)) print("\033[37;1m[\033[91;1m-\033[37;1m]\033[92;1m {}:\033[93;1m Error!".format(social_network))
exists = "error" exists = "error"
# Save exists flag # Save exists flag
results_site['exists'] = exists results_site['exists'] = exists

Loading…
Cancel
Save