# Multi-threaded crawler for Douban users.
import threading
import time
import requests
from pymongo import MongoClient
import json
class myThread(threading.Thread):
    """Thread that runs ``spider`` over a batch of URLs.

    Each instance creates its own ``MongoClient`` with default settings and
    writes into the ``users`` collection of the ``douban`` database.
    """

    def __init__(self, url):
        """Store the batch of URLs and open the target collection.

        Args:
            url: Iterable of URLs handed to ``spider`` when the thread runs
                (despite the singular name, it is iterated as a collection).
        """
        super().__init__()
        self.url = url
        self.col = MongoClient()['douban']['users']

    def run(self):
        """Thread entry point: crawl the stored URLs into the collection."""
        spider(self.url, self.col)
def _extract_author(item):
    """Return the author record of a gallery item, or ``None`` if absent.

    Looks at ``item['target']['status']['author']`` first and falls back to
    ``item['target']['author']`` when the first path does not exist.
    """
    try:
        return item['target']['status']['author']
    except (KeyError, TypeError):
        pass
    try:
        return item['target']['author']
    except (KeyError, TypeError):
        return None


def spider(urls, col):
    """Fetch every URL and store the author of each returned item in ``col``.

    Each response is parsed as JSON and its ``items`` list is walked. The
    author's ``name`` is used as the document ``_id``, so a second insert of
    the same name is rejected by the database and reported as a duplicate.

    A URL whose request fails, times out, or does not yield a JSON object
    with an ``items`` key is reported and skipped, so one bad page does not
    stop the remaining URLs from being crawled.

    Args:
        urls: Iterable of URLs to request.
        col: Collection-like object exposing ``insert_one(document)``.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://www.douban.com',
        'Referer': 'https://www.douban.com/gallery/topic/1348/?from=gallery_new_post',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }
    for url in urls:
        try:
            # Without a timeout a stalled connection would block the thread
            # indefinitely.
            resp = requests.get(url, headers=headers, timeout=10)
            items = json.loads(resp.text)['items']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print('skip {}: {!r}'.format(url, e))
            continue

        for item in items:
            users = _extract_author(item)
            # Items without a usable author record cannot be keyed; skip them.
            if not isinstance(users, dict) or 'name' not in users:
                continue
            users['_id'] = users['name']
            try:
                # insert_one replaces Collection.insert, which no longer
                # exists in PyMongo 4.
                col.insert_one(users)
            except Exception as e:
                # Best effort: the expected failure is a duplicate _id, but
                # any insert error is reported and the crawl continues.
                print(e)
                print('重复')
            else:
                print("======>>>>>" + users['name'])
                print('插入成功')
if __name__ == '__main__':
    # NOTE(review): paging begins at start=20, so the first page (start=0) is
    # never requested -- confirm whether that is intentional.
    url_template = 'https://m.douban.com/rexxar/api/v2/gallery/topic/84292/items?sort=hot&start={}&count=20&status_full_text=1&guest_only=0&ck=yzY7'
    urls = [url_template.format(start) for start in range(20, 4000, 20)]

    # Split by list position, not by the ``start`` offset: the list holds only
    # 199 URLs, so slicing at index 2000 would hand every URL to the first
    # thread and leave the second one idle. The midpoint (index 99) is exactly
    # where start reaches 2000.
    mid = len(urls) // 2
    thread1 = myThread(urls[:mid])
    thread2 = myThread(urls[mid:])
    thread1.start()
    thread2.start()