# Scrape jokes (段子) from Qiushibaike (糗事百科).
import urllib.request
import re
def _fetch_page(url):
    """Download *url* with a browser-like User-Agent and return it as UTF-8 text."""
    headers = ("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # install_opener makes this opener the process-wide default, so every
    # later urllib.request.urlopen() call also sends the header above.
    urllib.request.install_opener(opener)
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def _extract_entries(data):
    """Return ``(users, contents)`` lists scraped from the page markup *data*.

    Newlines are removed from every content string.
    """
    userPat = 'target="_blank" title="(.*?)">'
    contentPat = '<div class="content">(.*?)</div>'
    # re.S lets "." span newlines, since a content <div> covers several lines.
    users = re.findall(userPat, data, re.S)
    contents = [
        content.replace("\n", "")
        for content in re.findall(contentPat, data, re.S)
    ]
    return users, contents


def getContent(url, page):
    """Fetch one listing page and print each user name with its content.

    Users and contents are matched by their order of appearance in the
    page: the first user match goes with the first content match, and so on.

    Args:
        url: Address of the page to download; the body is decoded as UTF-8.
        page: Page label used only in the printed heading. It is
            concatenated directly with the 1-based entry index, with no
            separator between the two.

    Returns:
        None. Results are written to standard output.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    users, contents = _extract_entries(_fetch_page(url))

    for index, user in enumerate(users, start=1):
        # A user match without a corresponding content match is printed
        # with empty content instead of aborting the whole page.
        content = contents[index - 1] if index <= len(contents) else ""
        print("用户"+str(page)+str(index)+"是:"+user)
        print("内容是:")
        print(content)
        print("\n")
# Walk listing pages 1 through 29 and print the entries found on each one.
PAGE_URL_PREFIX = "http://www.qiushibaike.com/8hr/page/"
for i in range(1, 30):
    # The page number is appended to the prefix to form the page address and
    # is also passed on as the label used in the printed output.
    url = PAGE_URL_PREFIX + str(i)
    getContent(url, i)