from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_url_list(url):
    """Fetch a book's chapter index page and return absolute chapter URLs.

    Args:
        url: URL of the chapter-index page (e.g. a book's table of contents).

    Returns:
        list[str]: absolute URLs of every chapter linked under ``#list dl dd a``,
        in page order.
    """
    # Browser User-Agent so the site serves the normal page instead of
    # blocking the default requests UA.
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Resolve each (possibly relative) href against the index URL we were
    # actually given, rather than against a hard-coded base string — this
    # makes the function work for any book, and handles root-relative hrefs
    # correctly.
    return [urljoin(url, a.get("href")) for a in soup.select("#list dl dd a")]
def get_date(url):
    """Download one chapter page and append its title and body to novel.txt.

    Args:
        url: absolute URL of a single chapter page.

    Side effects:
        Appends "\\n\\n<title>\\n\\n<body text>" to ``novel.txt`` (UTF-8) in the
        current working directory, and prints the chapter title to stdout.

    Raises:
        IndexError: if the page contains no ``<h1>`` element.
    """
    # Send the same browser User-Agent as get_url_list; the original omitted
    # it here, which lets the site reject or alter this request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    section_name = soup.select("h1")[0].text
    print(section_name)  # progress indicator while scraping many chapters
    section_text = soup.select("#content")
    # Append mode so successive chapters accumulate into one file.
    with open('novel.txt', 'a', encoding='utf-8') as f:
        f.write('\n\n' + section_name + '\n\n')
        for block in section_text:
            f.write(block.text)
if __name__ == "__main__":
    # Scrape every chapter of book shu15674 into novel.txt. Guarded so that
    # importing this module does not kick off a full site crawl.
    base_url = 'http://www.qushuba.com/shu15674/'
    for chapter_url in get_url_list(base_url):
        get_date(chapter_url)