第1题:爬取捧腹网上笑话段子。
# -*- coding: utf-8 -*-
import sys
import urllib.request as request
from bs4 import BeautifulSoup
def get_jokes(nums):
    """Print the first ``nums`` jokes scraped from pengfu.com.

    Listing pages (``xiaohua_<page>.html``) are fetched one after another,
    starting at page 1. Each joke found is printed to standard output with
    a running number, until ``nums`` jokes have been printed.

    Args:
        nums: Number of jokes to print. No page is fetched when it is zero
            or negative.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched.
    """
    # The header value must be the browser string only; repeating the
    # "User-Agent:" field name inside the value sends a malformed header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    cnt = 0
    page = 1
    while cnt < nums:
        url = 'http://www.pengfu.com/xiaohua_%d.html' % page
        req = request.Request(url, headers=headers)
        # Close the HTTP response as soon as its body has been read.
        with request.urlopen(req) as response:
            soup = BeautifulSoup(response.read(), 'lxml')

        jokes = soup.find_all(class_="content-img clearfix pt10 relative")
        if not jokes:
            # No joke containers on this page: stop instead of requesting
            # further pages forever.
            return

        for joke in jokes:
            content = joke.string
            if content is None:
                # BeautifulSoup returns None for .string when the tag does
                # not wrap a single string; such entries are skipped.
                continue
            cnt += 1
            print("%s. " % cnt + content.lstrip() + "\n")
            if cnt >= nums:
                return
        page += 1
if __name__ == '__main__':
    # Ask how many jokes are wanted, then print that many.
    get_jokes(int(input("Please enter number of jokes:")))
输出如下: