This crawls each user's following list and listeners breadth-first. Tencent Weibo has already shut down, the login code floating around online is all outdated, and I couldn't reverse-engineer the login myself, so I just copied the cookies straight from the browser; that's enough to fetch the pages that require being logged in.
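For example, here is a minimal sketch (not the original code, and the header value is a placeholder) of turning the raw Cookie header copied from the browser's developer tools into the dict that requests accepts via its cookies= argument:

# Hypothetical sketch: parse a raw Cookie header copied from the browser
# into a plain dict for requests.
raw_cookie = "uin=...; skey=..."  # placeholder: paste your own header value

cookies = dict(
    pair.split("=", 1) for pair in raw_cookie.split("; ") if "=" in pair
)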
Because of the shutdown, only 40 pages of each list are still served. The output format is [source, target], meaning source follows target. The same edge can be discovered twice: crawling from source finds target in its following list, while crawling from target finds source among its listeners, so a dedup step is needed.
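Stripped down to a sketch, the dedup idea looks like this (the real code below keeps a dict of adjacency lists instead; the names here are illustrative):

# Minimal sketch of the dedup: an edge (source, target) means
# "source follows target" and may be discovered from either end,
# so only the first sighting gets written out.
seen = set()

def add_edge(source, target, writer):
    if (source, target) not in seen:
        seen.add((source, target))
        writer.writerow([source, target])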
It pulls roughly 20,000 records an hour. I just wrote it straight through, no threads.
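If it ever needed to go faster, one option (purely a hypothetical sketch, not in the original) would be to fetch a user's 40 pages through a thread pool and parse them back on the main thread, since the crawler's shared dicts and CSV writers are not thread-safe; headers and cookies are the dicts defined in the script below:

# Hypothetical sketch: fetch one user's 40 relation pages concurrently.
from concurrent.futures import ThreadPoolExecutor
import time
import requests

def fetch_page(name, t, page, headers, cookies):
    # t=1 -> following list, t=2 -> listeners, mirroring the URLs below
    ctime = str(int(time.time() * 1000))
    url = ("http://api.t.qq.com/relations/follow_apollo.php"
           "?u={0}&t={1}&st=1&p={2}&apiType=14&apiHost=http://api.t.qq.com"
           "&_r={3}&g_tk=325301840").format(name, t, page, ctime)
    return requests.get(url, headers=headers, cookies=cookies, timeout=10).text

def fetch_all_pages(name, t, headers, cookies):
    with ThreadPoolExecutor(max_workers=8) as pool:
        return list(pool.map(lambda p: fetch_page(name, t, p, headers, cookies),
                             range(1, 41)))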
The trash code follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: Starry
@file: Tencentweibo.py
@time: 2018/7/15 9:50
'''
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import datetime
import json
import csv
import os
cookies = {
    # paste the cookies from a logged-in browser session here
}
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Host": "api.t.qq.com",
    "Pragma": "no-cache",
    "Referer": "http://api.t.qq.com/proxy.html",
    "rf": "http://t.qq.com/anjianbin1979/following?t=1#u=anjianbin1979&t=1&st=1&p=2",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
class TencentWeibo:
    def __init__(self, start_name, start_title):
        self.start_name = start_name
        self.start_title = start_title
        self.que = Queue()        # BFS queue of usernames still to crawl
        self.nameToId = {}        # username -> numeric id
        self.current_num = 1      # highest numeric id handed out so far
        self.visName = []         # usernames already crawled
        self.unique = {}          # id -> ids it follows, for edge dedup
        self.init_exe()

    def init_exe(self):
        # information.csv holds (id, user, name); data.csv holds the follow
        # edges. If they already exist, reload them so a restart resumes
        # instead of starting over.
        if not os.path.exists('information.csv'):
            self.csv_information = csv.writer(
                open('information.csv', 'a', newline='', encoding='utf-8'),
                dialect='excel')
            self.csv_information.writerow(['id', 'user', 'name'])
            self.csv_information.writerow([1, self.start_name, self.start_title])
            self.que.put(self.start_name)
            self.nameToId[self.start_name] = self.current_num
            self.unique[self.nameToId[self.start_name]] = []
        else:
            with open('information.csv', 'r', encoding='utf-8') as f:
                csvFile = csv.reader(f, dialect='excel')
                for index, item in enumerate(csvFile):
                    if index == 0:
                        continue
                    self.que.put(item[1])
                    self.nameToId[item[1]] = int(item[0])
                    self.current_num = int(item[0])
            self.csv_information = csv.writer(
                open('information.csv', 'a', newline='', encoding='utf-8'),
                dialect='excel')
        if not os.path.exists('data.csv'):
            self.csv_data = csv.writer(
                open('data.csv', 'a', newline='', encoding='utf-8'),
                dialect='excel')
            self.csv_data.writerow(['Source', 'Target'])
        else:
            FLAG = 0
            with open('data.csv', 'r', encoding='utf-8') as f:
                csvFile = csv.reader(f, dialect='excel')
                for index, item in enumerate(csvFile):
                    if index == 0:
                        continue
                    id1, id2 = int(item[0]), int(item[1])
                    if id1 not in self.unique:
                        self.unique[id1] = []
                    if id2 not in self.unique:
                        self.unique[id2] = []
                    self.unique[id1].append(id2)
                    FLAG = min(id1, id2)
            # Drain the queue up to the last user seen in data.csv so the
            # crawl picks up roughly where it stopped.
            while not self.que.empty():
                name = self.que.get()
                if self.nameToId[name] == FLAG:
                    break
                self.visName.append(name)
            self.csv_data = csv.writer(
                open('data.csv', 'a', newline='', encoding='utf-8'),
                dialect='excel')
        print('Crawl starting!!!')
    def DealHtml(self, html, Flag, name):
        # The API returns an HTML fragment; every user on the page sits in
        # a <div class="userName"> whose <a> carries /username in its href
        # and the display name as its text.
        soup = BeautifulSoup(html, 'html.parser')
        li = soup.find_all('div', attrs={"class": "userName"})
        for child in li:
            try:
                uid = child.find('a').get('href')[1:]
                title = child.find('a').string
                if uid not in self.nameToId:
                    self.current_num += 1
                    self.nameToId[uid] = self.current_num
                    self.que.put(uid)
                    self.csv_information.writerow([self.current_num, uid, title])
                if self.nameToId[uid] not in self.unique:
                    self.unique[self.nameToId[uid]] = []
                if Flag == 1:
                    # page of people that `name` follows: edge name -> uid
                    if self.nameToId[uid] not in self.unique[self.nameToId[name]]:
                        self.unique[self.nameToId[name]].append(self.nameToId[uid])
                        self.csv_data.writerow([self.nameToId[name], self.nameToId[uid]])
                elif Flag == 2:
                    # page of `name`'s listeners (fans): edge uid -> name
                    if self.nameToId[name] not in self.unique[self.nameToId[uid]]:
                        self.unique[self.nameToId[uid]].append(self.nameToId[name])
                        self.csv_data.writerow([self.nameToId[uid], self.nameToId[name]])
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)
    def getRelations(self, name, t):
        # t=1 fetches the pages of people `name` follows, t=2 the pages of
        # `name`'s listeners. Only 40 pages are served since the shutdown,
        # hence range(1, 41); a response without "info" means no more pages.
        for i in range(1, 41):
            try:
                ctime = str(int(time.time() * 1000))
                url = ("http://api.t.qq.com/relations/follow_apollo.php"
                       "?u={0}&t={1}&st=1&p={2}&apiType=14"
                       "&apiHost=http://api.t.qq.com&_r={3}"
                       "&g_tk=325301840").format(name, t, i, ctime)
                ret = requests.get(url=url, headers=headers, cookies=cookies,
                                   timeout=10)
                ret_json = json.loads(ret.text)
                if "info" in ret_json:
                    self.DealHtml(ret_json['info'], t, name)
                else:
                    break
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)

    def getFans(self, name):
        self.getRelations(name, 2)

    def getIdol(self, name):
        self.getRelations(name, 1)
    def start(self):
        # Plain BFS: pop a username and crawl both of its relation lists;
        # the users found there were already enqueued by DealHtml.
        while not self.que.empty():
            visiter = self.que.get()
            if visiter not in self.visName:
                self.visName.append(visiter)
                self.getIdol(visiter)
                self.getFans(visiter)
class TencentWeiboArticles:
    # Unfinished stub for crawling the posts themselves.
    def __init__(self):
        self.que = Queue()
        self.IdToInformation = {}

    def start(self):
        pass


if __name__ == '__main__':
    weibo = TencentWeibo('xie_na', '谢娜')
    weibo.start()