First of all, I think it's worth saying that, I know there are a bunch of similar questions but NONE of them works for me...
I'm a newbie on Python, html and web scraper. I'm trying to scrape user information from a website which needs to login first. In my tests I use scraper my email settings from github as examples. The main page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'
Here are a list of methods I've tried
##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling it's session
br.open('https://github.com/login')
for f in br.forms():
print f
br.select_form(nr=0)
# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'
# Login
br.submit()
br.open('github.com/settings/emails').read()
################ Method 2
import urllib, urllib2, cookielib
username = 'myusername'
password = 'mypwd'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username' : username, 'j_password' : password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print resp.read()
############# Method 3
import urllib
opener = urllib.FancyURLopener()
print opener.open('http://myusername:mypwd@github.com/settings/emails').read()
########## Method 4
import mechanize
import cookielib
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
br.addheaders = [('User-agent', 'Chrome')]
br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print br.response().read()
############ Methods 5
from requests import session
payload = {
'action': 'login',
'username': 'myusername',
'password': 'mypwd'
}
with session() as c:
c.post('https://github.com/login', data=payload)
request = c.get('https://github.com/settings/emails')
print request.headers
print request.text
########### Method 6
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs
add_stderr_logger()
s = requests.Session()
s.headers['User-Agent'] = 'Chrome'
username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'
# after examining the HTML of the website you're trying to log into
# set name_form to the name of the form element that contains the name and
# set password_form to the name of the form element that will contain the password
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
for r in login_response.history:
if r.status_code == 401: # 401 means authentication failed
print 'error!'
sys.exit(1) # abort
pdf_response = s.get('https://github.com/settings/emails') # Your cookies and headers are automatically included
soup = bs(pdf_response.content)
Also I've read some discussions about differences between HTTP Authentication and cookies. Still none of them worked.
Please help and any help would be appreciated. Thank you very much.