I am trying scrape with BS4 via TOR, using the To Russia With Love tutorial from the Stem project.
I've rewritten the code a bit, using i.a. this answer, and it now looks like this,
SOCKS_PORT=7000
def query(url):
output = io.BytesIO()
query = pycurl.Curl()
query.setopt(pycurl.URL, url)
query.setopt(pycurl.PROXY, 'localhost')
query.setopt(pycurl.PROXYPORT, SOCKS_PORT)
query.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME)
query.setopt(pycurl.WRITEFUNCTION, output.write)
try:
query.perform()
return output.getvalue()
except pycurl.error as exc:
return "Unable to reach %s (%s)" % (url, exc)
def print_bootstrap_lines(line):
if "Bootstrapped " in line:
print(term.format(line, term.Color.BLUE))
print(term.format("Starting Tor:\n", term.Attr.BOLD))
tor_process = stem.process.launch_tor_with_config(
tor_cmd = '/Applications/TorBrowser.app/Contents/MacOS/Tor/tor.real',
config = {
'SocksPort': str(SOCKS_PORT),
'ExitNodes': '{ru}',
'GeoIPFile': r'/Applications/TorBrowser.app/Contents/Resources/TorBrowser/Tor/geoip',
'GeoIPv6File' : r'/Applications/TorBrowser.app/Contents/Resources/TorBrowser/Tor/geoip6'
},
init_msg_handler = print_bootstrap_lines,
)
print(term.format("\nChecking our endpoint:\n", term.Attr.BOLD))
print(term.format(query("https://www.atagar.com/echo.php"), term.Color.BLUE))
I am able to Establish a Tor circuit, but at "checking our endpoint", I receive a the following error,
Checking our endpoint:
Traceback (most recent call last):
File "<ipython-input-804-68f8df2c050b>", line 40, in <module>
print(term.format(query('https://www.atagar.com/echo.php'), term.Color.BLUE))
File "/Applications/anaconda/lib/python3.6/site-packages/stem/util/term.py", line 139, in format
if RESET in msg:
TypeError: a bytes-like object is required, not 'str'
What should I change to see the endpoint?
I've temporarily solved it by changing the last line of the above code with,
test=requests.get('https://www.atagar.com/echo.php')
soup = BeautifulSoup(test.content, 'html.parser')
print(soup)
but I'd like to know how to get the 'original' line working.