# Scrape shared-rental housing listings from Douban.
import re
import requests
import json
from requests.exceptions import RequestException
def get_response(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. A timeout
            surfaces as a ``RequestException`` and therefore yields ``None``.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException``.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3357.400 QQBrowser/9.6.11858.400'}
    try:
        # `headers` must be passed by keyword: the second positional
        # parameter of requests.get() is `params`, so passing the dict
        # positionally would append it to the query string and send the
        # default User-Agent instead.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Matches one `<tr class="">` table row and captures five fields in order:
# link href, first anchor text, second anchor text, a `class=""` cell and the
# `class="time"` cell. Raw string so that `\s` reaches the regex engine
# without triggering an invalid-escape warning; compiled once at import time.
_TOPIC_ROW_RE = re.compile(
    r'<tr\sclass="">.*?href="(.*?)".*?class="">(.*?)</a>.*?class="">(.*?)</a></td>.*?class="">(.*?)</td>.*?class="time">(.*?)</td>.*?</tr>',
    re.S,
)


def get_html(html):
    """Yield one dict per table row found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string (e.g. the
            result of a failed download) yields nothing instead of raising.

    Yields:
        Dicts with the keys ``"URL"``, ``"title"``, ``"ID"``, ``"回帖数"`` and
        ``"date"``, filled from the five regex capture groups in that order.
    """
    if not html:
        return
    # finditer streams matches lazily, which suits a generator better than
    # materialising the full findall() list up front.
    for match in _TOPIC_ROW_RE.finditer(html):
        url, title, poster, replies, date = match.groups()
        yield {
            "URL": url,
            "title": title,
            "ID": poster,
            "回帖数": replies,
            "date": date,
        }
def write_file(item):
with open("file.txt", 'a', encoding='utf-8')as f: