A simple crawler for Zhihu user profiles.
It uses Python 3.5 and a MySQL database; set up that environment yourself.
The code was tested successfully on Windows (no money for a Mac, alas).
For the database inserts I wrote two versions, one with plain queries and one with coroutines, mainly to compare their performance.
I'm still a Python novice, so the code is a bit messy.
Enough preamble; let's get started.
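First, the imports that everything below depends on. A minimal sketch, assuming the third-party packages requests, beautifulsoup4 (plus html5lib), Pillow, and mysql-connector-python are installed:

import http.cookiejar
import json
import re
import threading
import time

import mysql.connector
import requests
from bs4 import BeautifulSoup
from PIL import Image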
Global variables:
# Request headers
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
    "Referer": "http://www.zhihu.com/",
    'Host': 'www.zhihu.com',
}
# Filename for the saved cookies
filename = 'cookie'
# Create a session
session = requests.Session()
# Use an LWPCookieJar, which can save cookies in Set-Cookie3 format
session.cookies = http.cookiejar.LWPCookieJar(filename)
Logging in to Zhihu:
# Log in
def login():
    # Prompt for the account and password on first run
    username = input('Username: ')
    password = input('Password: ')
    # An "@" means the account is an email address
    if "@" in username:
        print('Logging in with email...')
        url = 'https://www.zhihu.com/login/email'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'email': username
                }
    else:
        print('Logging in with phone number...')
        url = 'http://www.zhihu.com/login/phone_num'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'phone_num': username
                }
    # Try logging in without a captcha first
    try:
        result = session.post(url, data=data, headers=headers)
        print(json.loads(result.text)['msg'])
    # If that fails, fetch a captcha and post again
    except Exception:
        data['captcha'] = get_captcha()
        print(data)
        result = session.post(url, data=data, headers=headers)
        print(result.text)
        print(json.loads(result.text)['msg'])
    # Save the cookies locally
    session.cookies.save(ignore_discard=True, ignore_expires=True)
Fetching the _xsrf token:
# Fetch the _xsrf token from the home page
def get_xsrf():
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    get_xsrf_pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf
Fetching the captcha:
# Fetch the captcha image and ask the user to read it
def get_captcha():
    t = str(int(time.time() * 1000))
    captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    response = session.get(captcha_url, headers=headers)
    with open('captcha.gif', 'wb') as f:
        f.write(response.content)
    im = Image.open('captcha.gif')
    im.show()
    captcha = input('Enter the captcha: ')
    return captcha
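To tie these pieces together, here is one way to reuse saved cookies when they exist and fall back to a fresh login otherwise. The ensure_login helper is hypothetical (it is not in the original code); it is sketched from the cookie-loading call that appears near the end of this post:

import os

# Hypothetical helper: load the saved cookie file if it exists,
# otherwise perform a fresh login (which saves new cookies).
def ensure_login():
    if os.path.exists(filename):
        try:
            session.cookies.load(ignore_discard=True, ignore_expires=True)
            return
        except Exception:
            pass  # stale or corrupt cookie file; fall through to a fresh login
    login()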
Parsing out the user's basic info (Zhihu's page layout changes all the time, so when the program stops working, the parsing here is most likely the culprit; adjust the parsing yourself, with the BeautifulSoup documentation as a reference):
# Fetch a user's basic profile
def get_userInfo(userID):
    user_url = 'https://www.zhihu.com/people/' + userID
    response = session.get(user_url, headers=headers)
    # I code on Windows, so html5lib is the parser I had to use; a Mac will have to wait
    soup = BeautifulSoup(response.content, 'html5lib')
    # Dump the page for debugging
    # with open('zhihuprofile.html', 'wb') as f:
    #     f.write(response.content)
    d = {}
    # userId = soup.find("a", class_="Tabs-link")["href"].split("/")[2]
    d['userId'] = userID
    try:
        nickname = soup.find_all('span', {'class': 'ProfileHeader-name'})[0].string
    except Exception:
        nickname = "None"
    d['nickname'] = nickname
    try:
        word = soup.find('span', class_="RichText ProfileHeader-headline").string
        if word is None:
            word = 'None'
    except Exception:
        word = "None"
    d['word'] = word
    try:
        business = soup.find_all('div', {'class': 'ProfileHeader-iconWrapper'})[0].next_sibling
        if business is None:
            business = 'None'
    except Exception:
        business = 'None'
    d['business'] = business
    try:
        company = soup.find_all('div', {'class': 'ProfileHeader-divider'})[0].next_sibling
        if company is None:
            company = 'None'
    except Exception:
        company = 'None'
    d['company'] = company
    try:
        location = soup.find_all('div', {'class': 'ProfileHeader-divider'})[1].next_sibling
        if location is None:
            location = 'None'
    except Exception:
        location = "None"
    d['location'] = location
    try:
        school = soup.find_all('div', {'class': 'ProfileHeader-iconWrapper'})[1].next_sibling
        if school is None:
            school = 'None'
    except Exception:
        school = 'None'
    d['school'] = school
    try:
        subject = soup.find_all('div', {'class': 'ProfileHeader-divider'})[2].next_sibling
        if subject is None:
            subject = 'None'
    except Exception:
        subject = 'None'
    d['subject'] = subject
    try:
        # A failed split means the user has no "answers" entry
        answers = soup.find('div', {'class': 'IconGraf-iconWrapper'}).next_sibling.split(' ')[1]
    except Exception:
        answers = None
    if answers is None:
        answers = 0
    d['answers'] = answers
    try:
        followees = soup.find_all('div', {'class': 'Profile-followStatusValue'})[0].string
    except Exception:
        followees = None
    if followees is None:
        followees = 0
    # print('followees: %s' % followees)
    d['followees'] = followees
    try:
        followers = soup.find_all('div', {'class': 'Profile-followStatusValue'})[1].string
    except Exception:
        followers = None
    if followers is None:
        followers = 0
    d['followers'] = followers
    return d
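A quick smoke test after logging in; the profile slug below is a placeholder, so substitute any real Zhihu user id:

# 'some-user-id' is a hypothetical slug used only for illustration
info = get_userInfo('some-user-id')
print(info['nickname'], info['followers'])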
# Fetch the profile URLs of the people this user follows (only the first three are taken)
def followeesUrl(userId):
    user_url = 'https://www.zhihu.com/people/' + userId + "/following"
    response = session.get(user_url, headers=headers)
    # html5lib again, for the same Windows reason as above
    soup = BeautifulSoup(response.content, 'html5lib')
    # Dump the page for debugging
    # with open('following.html', 'wb') as f:
    #     f.write(response.content)
    urls = soup.find_all("div", {'aria-haspopup': "true"})
    # Deduplicate the URLs with a set
    urllist = set()
    for url in urls:
        urllist.add(url.a["href"])
    # Join into one comma-separated string and return it
    return ','.join(urllist)
Next come the MySQL operations:
# Database connection settings; fill in your own credentials
conn = mysql.connector.connect(host='localhost', user='****', password='****', database='zhihu')
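For reference, here is a sketch of the zhihu table the queries below assume. The authoritative schema is the sql file in the repo; the column types here are guesses inferred from the insert statement, and userId is assumed to be the primary key because saveInfo relies on duplicate inserts failing:

# Hypothetical reconstruction of the schema; see the repo's sql file for the real one
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS zhihu (
        userId    VARCHAR(64) PRIMARY KEY,
        nickname  VARCHAR(64),
        word      VARCHAR(255),
        business  VARCHAR(64),
        company   VARCHAR(64),
        location  VARCHAR(64),
        school    VARCHAR(64),
        subject   VARCHAR(64),
        answers   INT,
        followers INT,
        followees INT,
        f_url     TEXT,
        flag      TINYINT DEFAULT 0  -- 0 = not crawled yet, 1 = crawled
    )
""")
cursor.close()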
# Save one user's info to the database
def saveInfo(info):
    cursor = conn.cursor()
    data = [str(info.get("userId")), str(info.get("nickname")), str(info.get("word")), str(info.get("business")),
            str(info.get("company")), str(info.get("location")), str(info.get("school")), str(info.get("subject")),
            int(info.get("answers")), int(info.get("followers")), int(info.get("followees")), info.get("f_url")]
    try:
        cursor.execute(
            "insert into zhihu(userId,nickname,word,business,company,location,school,subject,answers,followers,followees,f_url) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            data)
    except Exception:
        # Already stored; the duplicate insert fails and is ignored
        pass
    conn.commit()
    cursor.close()
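The coroutine version of the insert mentioned at the top is not reproduced in this post. Purely as a sketch of the idea, here is what an asynchronous counterpart could look like with aiomysql; every name below is an assumption, not the original code:

import asyncio
import aiomysql

# Sketch only: an async counterpart to saveInfo, using a connection pool.
# The column list is shortened here; the real insert uses the same twelve
# columns as saveInfo above.
async def saveInfo_async(pool, info):
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            try:
                await cursor.execute(
                    "insert into zhihu(userId,nickname,f_url) VALUES (%s,%s,%s)",
                    [info["userId"], info["nickname"], info["f_url"]])
            except Exception:
                pass  # already stored
            await conn.commit()

async def main(info):
    pool = await aiomysql.create_pool(host='localhost', user='****',
                                      password='****', db='zhihu')
    await saveInfo_async(pool, info)
    pool.close()
    await pool.wait_closed()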
# Update the status flag; 1 means the user has been crawled
def changeState(userId):
    cursor = conn.cursor()
    data = [userId]
    cursor.execute("update zhihu set flag = 1 where userId = %s", data)
    conn.commit()
    cursor.close()
# Fetch the URL list of one user that has not been crawled yet
def selectOneUrl():
    cursor = conn.cursor()
    flag = [0]
    # One uncrawled row is enough
    cursor.execute("select userId,f_url from zhihu where flag = %s limit 1", flag)
    data = cursor.fetchall()
    userId = data[0][0]
    url = data[0][1]
    urlList = url.split(",")
    # Wrap it in an object and return
    urlOne = SelectOne(userId, urlList)
    conn.commit()
    cursor.close()
    return urlOne
The driver logic follows (it keeps storing into the database):
# ------------------- driver code ----------------------------
def threadingExecution():
    # Pull one row that has not been crawled yet from the database
    try:
        urltemp = selectOneUrl()
    except Exception:
        print("All users have been processed")
        return
    userId = urltemp.get_userId()
    urls = urltemp.get_urlList()
    if urls:
        for u in urls:
            try:
                id = u.split("/")[2]
            except Exception:
                continue
            info = get_userInfo(id)
            nickname = info['nickname']
            if nickname == 'None':
                continue
            print(nickname)
            info['f_url'] = followeesUrl(info['userId'])
            saveInfo(info)
            # Sleep 5 seconds between requests; going too fast risks being blocked
            time.sleep(5)
    changeState(userId)
# Crawl users in batches
def selectMessage():
    while True:
        # Start time
        start = time.time()
        t = threading.Thread(target=threadingExecution)
        t.daemon = True
        t.start()
        t.join(30)
        # End time
        end = time.time()
        # Leave the loop on timeout
        if (end - start) > 29:
            break
    print("Timed out ----")
    # After leaving the loop, sleep three minutes, reconnect, and call again
    time.sleep(180)
    session.cookies.load(filename=filename, ignore_discard=True)
    selectMessage()
# Holds one row fetched from the database
class SelectOne:
    def __init__(self, userId, urlList):
        self.__userId = userId
        self.__urlList = urlList

    def get_userId(self):
        return self.__userId

    def get_urlList(self):
        return self.__urlList

    def set_userId(self, userId):
        self.__userId = userId

    def set_urlList(self, urlList):
        self.__urlList = urlList
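As a design note, this getter/setter class is Java-flavored; in idiomatic Python a namedtuple does the same job. Adopting the sketch below would mean changing the call sites from urltemp.get_userId() to urltemp.userId:

from collections import namedtuple

# Equivalent lightweight container with plain field access
SelectOne = namedtuple('SelectOne', ['userId', 'urlList'])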
Database screenshot:
Finally, the code is commented in detail throughout. For the sql file and the complete code, head over to GitHub.