偶然看到一段抓取 LinkedIn 数据的 Python 代码,真是简单:发请求只需要两句话
# The HTTP library's module name is "requests" (plural).
import requests
# `url` is a placeholder for the address to fetch.
requests.get(url)
牛啊,这在别的语言至少10行啊。
解析 HTML 用 lxml 的 etree,直接用 XPath 扫 DOM 节点取数据,真是方便。看一段登录的代码:
def login(laccount, lpassword):
    """Log in to LinkedIn with the given account and password.

    Fetches the login page, copies the values of its form inputs, and
    posts them together with the credentials to the login-submit URL,
    all on a single ``requests.Session``.

    Args:
        laccount: Account identifier, sent as the ``session_key`` field.
        lpassword: Password, sent as the ``session_password`` field.

    Returns:
        The session used for both requests. The response to the POST is
        not inspected here, so the caller cannot tell from the return
        value alone whether the login succeeded.
    """
    session = requests.Session()
    login_page = session.get('https://www.linkedin.com/uas/login')
    dom = etree.HTML(login_page.content)

    def input_value(attr, ident):
        # xpath() yields a list of matching @value strings; joining them
        # gives '' when the input is absent from the page.
        matches = dom.xpath('//input[@%s="%s"]/@value' % (attr, ident))
        return ''.join(matches)

    # Some inputs are located by @id and others by @name, mirroring the
    # selectors the login page markup is expected to expose.
    form = {
        'isJsEnabled': input_value('name', 'isJsEnabled'),
        'source_app': input_value('name', 'source_app'),
        'tryCount': input_value('id', 'tryCount'),
        'clickedSuggestion': input_value('id', 'clickedSuggestion'),
        'session_key': laccount,
        'session_password': lpassword,
        'signin': input_value('name', 'signin'),
        'session_redirect': input_value('name', 'session_redirect'),
        'trk': input_value('name', 'trk'),
        'loginCsrfParam': input_value('id', 'loginCsrfParam-login'),
        'fromEmail': input_value('name', 'fromEmail'),
        'csrfToken': input_value('id', 'csrfToken-login'),
        'sourceAlias': input_value('id', 'sourceAlias-login'),
    }

    session.post('https://www.linkedin.com/uas/login-submit', data=form)
    return session
学习用python,太强大了,一顶十啊