import requests
from bs4 import BeautifulSoup
def check(items):
    """Return *items* unchanged, or a placeholder string when it is empty.

    Args:
        items: Any sized container (must support ``len()``).

    Returns:
        The string ``"No Public House"`` when ``len(items)`` is zero,
        otherwise the very same *items* object that was passed in.
    """
    return items if len(items) != 0 else "No Public House"
def got_html(url, timeout=10):
    """Download *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` waits indefinitely, so a single
            stalled connection would hang the whole crawl.

    Returns:
        The undecoded response body (``response.content``, bytes).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/69.0.3497.100 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
# CSS selectors for the seven fields pulled from every topic <tbody>, in the
# order they are appended to the result: name, author, date, callback,
# look_times, last_public, date_time.
_TOPIC_FIELD_SELECTORS = (
    'tr > th > span.checkbox_title > a',
    'tr > td.author > cite > a',
    'tr > td.author > em',
    'tr > td.nums > cite',
    'tr > td.nums > em',
    'tr > td.lastpost > cite > a',
    'tr > td.lastpost > em',
)


def parse_html(html):
    """Extract topic fields from a forum list page.

    Every ``<tbody>`` matched by ``#topic_list > form > table > tbody``
    except the first one is treated as one topic. For each topic the text of
    the first element matching each selector in ``_TOPIC_FIELD_SELECTORS``
    is collected, stripped of surrounding whitespace.

    Args:
        html: Page markup (str or bytes) to hand to BeautifulSoup.

    Returns:
        A flat list of strings: seven consecutive entries per successfully
        parsed topic. Topics missing any of the seven elements are skipped
        entirely. Returns an empty list when the page has no matching
        ``<tbody>`` at all.
    """
    soup = BeautifulSoup(html, 'lxml')
    bodies = soup.select('#topic_list > form > table > tbody')
    item_list = []
    # The first <tbody> is dropped, as before (presumably a header or pinned
    # section -- confirm against the page). Slicing instead of pop(0) keeps
    # an empty match from raising IndexError.
    for body in bodies[1:]:
        try:
            fields = [
                body.select(selector)[0].get_text().strip()
                for selector in _TOPIC_FIELD_SELECTORS
            ]
        except IndexError:
            # A selector matched nothing: skip this topic as a whole rather
            # than emit a partial record.
            continue
        item_list.extend(fields)
    return item_list
if __name__ == '__main__':
    # Crawl forum list pages 1..N, where N is typed in by the user
    # (the prompt text means "last page number").
    url_prefix = 'https://bbs.pcauto.com.cn/forum-17442-'
    url_suffix = '.html'
    last_page = int(input('终止页码'))
    for page_no in range(1, last_page + 1):
        page_url = ''.join((url_prefix, str(page_no), url_suffix))
        # NOTE(review): the parsed result is discarded here, exactly as in
        # the original script -- confirm whether it should be stored/printed.
        parse_html(got_html(page_url))