我正在尝试使用Python请求登录LinkedIn:
# Attempt to log in to LinkedIn with `requests` and inspect the landing-page
# title to see whether the login succeeded.
import sys
import requests
from bs4 import BeautifulSoup  # bs4 replaces the unmaintained BeautifulSoup 3

# LinkedIn login form field names mapped to the credentials.
payload = {
    'session-key': 'user@email.com',
    'session-password': 'password',
}
URL = 'https://www.linkedin.com/uas/login-submit'

# A Session keeps cookies across requests, so any login cookie set by the
# POST is sent automatically with the following GET.
s = requests.session()
s.post(URL, data=payload)
r = s.get('http://www.linkedin.com/nhome')

# Name the parser explicitly so bs4 does not emit a warning / pick one at random.
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.find('title'))  # the <title> tells us whether we are logged in
我似乎无法用这种方法登录。我甚至尝试过在有效负载（payload）中加入 csrf 令牌等参数，但会话（session）不是应该帮你自动处理这些吗？
关于最后一行的注意事项:我使用标题来检查是否已成功登录。(如果我已经登录,则应该看到“ Welcome!| LinkedIn”,而我会看到“世界上最大的专业网络| LinkedIn”)
我想念什么吗?
我修改了一个可满足大多数基于 Python 的抓取需求的网络抓取模板，来解决你的问题。我已经用自己的登录信息验证过它可以正常工作。
它的工作方式是通过模仿浏览器并维护一个cookieJar来存储您的用户会话。也可以与BeautifulSoup一起使用。
注意: 这是Python2版本。我根据要求在下面进一步添加了一个可工作的Python3示例。
import cookielib import os import urllib import urllib2 import re import string from BeautifulSoup import BeautifulSoup username = "user@email.com" password = "password" cookie_filename = "parser.cookies.txt" class LinkedInParser(object): def __init__(self, login, password): """ Start up... """ self.login = login self.password = password # Simulate browser with cookies enabled self.cj = cookielib.MozillaCookieJar(cookie_filename) if os.access(cookie_filename, os.F_OK): self.cj.load() self.opener = urllib2.build_opener( urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0), urllib2.HTTPCookieProcessor(self.cj) ) self.opener.addheaders = [ ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; ' 'Windows NT 5.2; .NET CLR 1.1.4322)')) ] # Login self.loginPage() title = self.loadTitle() print title self.cj.save() def loadPage(self, url, data=None): """ Utility function to load HTML from URLs for us with hack to continue despite 404 """ # We'll print the url in case of infinite loop # print "Loading URL: %s" % url try: if data is not None: response = self.opener.open(url, data) else: response = self.opener.open(url) return ''.join(response.readlines()) except: # If URL doesn't load for ANY reason, try again... # Quick and dirty solution for 404 returns because of network problems # However, this could infinite loop if there's an actual problem return self.loadPage(url, data) def loginPage(self): """ Handle login. This should populate our cookie jar. 
""" html = self.loadPage("https://www.linkedin.com/") soup = BeautifulSoup(html) csrf = soup.find(id="loginCsrfParam-login")['value'] login_data = urllib.urlencode({ 'session_key': self.login, 'session_password': self.password, 'loginCsrfParam': csrf, }) html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data) return def loadTitle(self): html = self.loadPage("https://www.linkedin.com/feed/") soup = BeautifulSoup(html) return soup.find("title") parser = LinkedInParser(username, password)
2014年6月19日更新: 从首页添加了对CSRF令牌的解析,以用于更新的登录过程。
2015年7月23日更新: 在此处添加 Python 3 示例。基本上只需替换库的导入位置并移除已弃用的方法。它的格式不够完美，但可以正常工作。抱歉写得比较仓促。原理和步骤与 Python 2 版本相同。
# Python 3 port: http.cookiejar replaces cookielib, urllib.request/urllib.parse
# replace urllib2/urllib, and bs4 replaces BeautifulSoup 3.
import http.cookiejar as cookielib
import os
import re      # NOTE(review): unused; kept from the original template
import string  # NOTE(review): unused; kept from the original template
# `import urllib` alone does not guarantee the request/parse submodules are
# bound — import them explicitly to avoid AttributeError.
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

username = "user@email.com"
password = "password"
cookie_filename = "parser.cookies.txt"

# Bound the page-reload attempts so a persistent failure cannot recurse forever.
MAX_RETRIES = 3


class LinkedInParser(object):

    def __init__(self, login, password):
        """Log in immediately and print the landing-page title as a sanity check."""
        self.login = login
        self.password = password

        # Simulate a browser with cookies enabled; reuse a saved session if present.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login, then verify by reading the feed page title.
        self.loginPage()
        title = self.loadTitle()
        print(title)

        self.cj.save()

    def loadPage(self, url, data=None, retries=MAX_RETRIES):
        """Return the decoded HTML at *url*, POSTing *data* (bytes) if given.

        Transient failures are retried a bounded number of times; the last
        error is re-raised once *retries* is exhausted, instead of recursing
        without limit as the original did.
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # Decode the raw bytes once. The original str()-ed each line,
            # which embeds literal "b'...'" markers in the HTML and breaks
            # parsing downstream.
            return response.read().decode('utf-8', errors='replace')
        except Exception:
            # Quick-and-dirty workaround for intermittent network problems.
            if retries <= 0:
                raise
            return self.loadPage(url, data, retries - 1)

    def loadSoup(self, url, data=None):
        """Combine loading of URL, HTML, and parsing with BeautifulSoup."""
        html = self.loadPage(url, data)
        # html.parser ships with the standard library; html5lib would require
        # an extra third-party install.
        return BeautifulSoup(html, "html.parser")

    def loginPage(self):
        """Handle login. This should populate our cookie jar."""
        soup = self.loadSoup("https://www.linkedin.com/")
        # LinkedIn requires a CSRF token parsed out of the public front page.
        csrf = soup.find(id="loginCsrfParam-login")['value']
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')
        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Fetch the feed page and return its <title> tag (login indicator)."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")


parser = LinkedInParser(username, password)