From ae657460c8c069bdc2767f27f02b2d26145c83fb Mon Sep 17 00:00:00 2001 From: "Christopher K. Hoadley" Date: Tue, 22 Jan 2019 20:16:19 -0600 Subject: [PATCH 1/3] Add coverage tests for all sites that use the response URL detection method. This test fails because Sherlock does not handle all of these sites properly. --- tests/all.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/tests/all.py b/tests/all.py index 87d3b9b..3d06281 100644 --- a/tests/all.py +++ b/tests/all.py @@ -23,7 +23,7 @@ class SherlockDetectTests(SherlockBaseTest): """ self.username_check(['jack'], ['Twitter'], exist_check=True) - #self.username_check(['dfox'], ['devRant'], exist_check=True) + self.username_check(['dfox'], ['devRant'], exist_check=True) self.username_check(['blue'], ['Pinterest'], exist_check=True) self.username_check(['kevin'], ['Instagram'], exist_check=True) self.username_check(['zuck'], ['Facebook'], exist_check=True) @@ -92,3 +92,51 @@ class SherlockDetectTests(SherlockBaseTest): ) return + + +class SherlockSiteCoverageTests(SherlockBaseTest): + def test_coverage_false_via_response_url(self): + """Test Username Does Not Exist Site Coverage (Via Response URL). + + This test checks all sites with the "response URL" detection mechanism + to ensure that a Username that does not exist is reported that way. + + Keyword Arguments: + self -- This object. + + Return Value: + N/A. + Will trigger an assert if detection mechanism did not work as expected. + """ + + self.username_check(['noonewouldeverusethis7'], + ["Pinterest", "iMGSRC.RU", "Pastebin", + "WordPress", "devRant", "ImageShack", "MeetMe" + ], + exist_check=False + ) + + return + + def test_coverage_true_via_response_url(self): + """Test Username Does Exist Site Coverage (Via Response URL). + + This test checks all sites with the "response URL" detection mechanism + to ensure that a Username that does exist is reported that way. 
+ + Keyword Arguments: + self -- This object. + + Return Value: + N/A. + Will trigger an assert if detection mechanism did not work as expected. + """ + + self.username_check(['blue'], + ["Pinterest", "iMGSRC.RU", "Pastebin", + "WordPress", "devRant", "ImageShack", "MeetMe" + ], + exist_check=True + ) + + return From bb66d6a992debdd27323c848c3ccee8d8d35aa7e Mon Sep 17 00:00:00 2001 From: "Christopher K. Hoadley" Date: Tue, 22 Jan 2019 20:19:34 -0600 Subject: [PATCH 2/3] Update Pinterest and WordPress user URLs to exactly match what the site ends up with. If the request does not have the trailing "/", then the site will forward us to that URL. --- data.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data.json b/data.json index f47095f..5b3122c 100644 --- a/data.json +++ b/data.json @@ -41,7 +41,7 @@ "errorMsg":"page not found" }, "Pinterest": { - "url": "https://www.pinterest.com/{}", + "url": "https://www.pinterest.com/{}/", "urlMain": "https://www.pinterest.com/", "errorType": "response_url", "errorUrl": "https://www.pinterest.com/?show_error=true" @@ -415,7 +415,7 @@ "errorType": "status_code" }, "WordPress": { - "url": "https://{}.wordpress.com", + "url": "https://{}.wordpress.com/", "urlMain": "https://wordpress.com", "errorType": "response_url", "errorUrl": "wordpress.com/typo/?subdomain=", From 65b38592c427a5a2c064009fc8185302fc5ad42b Mon Sep 17 00:00:00 2001 From: "Christopher K. Hoadley" Date: Tue, 22 Jan 2019 20:37:05 -0600 Subject: [PATCH 3/3] Change "response_url" detection strategy completely. Previously, there was a problem with sites that redirect an attempt to view a non-existing username to the main site. For example, if you try to go to https://devrant.com/users/dfoxxxxxxxxx (a user name that does not exist), then we get a redirect to the https://devrant.com/ root of the site. But, the "response_url" checking algorithm was only looking for the configured error URL being included in the response. 
So, these sites always indicated that the username was not found. Update the "response_url" detection method so that the request does not allow redirects. If we get a 2xx (success) response, then the username has been found. However, if we get something like a 302, then we know that the username was not found as we are being redirected. This whole method seems fragile, but I did exhaustively test all of the supported sites, and they all work. So, this change is clearly an improvement. --- sherlock.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/sherlock.py b/sherlock.py index e7fef9d..3cc2850 100644 --- a/sherlock.py +++ b/sherlock.py @@ -208,13 +208,27 @@ def sherlock(username, site_data, verbose=False, tor=False, unique_tor=False, pr if net_info["errorType"] == 'status_code': request_method = session.head + if net_info["errorType"] == "response_url": + #Site forwards request to a different URL if username not + #found. Disallow the redirect so we can capture the + #http status from the original URL request. + allow_redirects = False + else: + #Allow whatever redirect that the site wants to do. + #The final result of the request will be what is available. 
+ allow_redirects = True + # This future starts running the request in a new thread, doesn't block the main thread if proxy != None: proxies = {"http": proxy, "https": proxy} - future = request_method( - url=url, headers=headers, proxies=proxies) + future = request_method(url=url, headers=headers, + proxies=proxies, + allow_redirects=allow_redirects + ) else: - future = request_method(url=url, headers=headers) + future = request_method(url=url, headers=headers, + allow_redirects=allow_redirects + ) # Store future in data for access later net_info["request_future"] = future @@ -290,9 +304,13 @@ def sherlock(username, site_data, verbose=False, tor=False, unique_tor=False, pr exists = "no" elif error_type == "response_url": - error = net_info.get("errorUrl") - # Checks if the redirect url is the same as the one defined in data.json - if not error in r.url: + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). + if (r.status_code >= 200) and (r.status_code < 300): + # print_found(social_network, url, response_time, verbose) write_to_file(url, f) exists = "yes"