Python感官上与Java的区别:
- 变量的定义不需要指出类型,而是根据值来确定类型.
- 函数用 def 定义,不需要声明返回类型;代码块不用 {},而是用 : 加缩进来表示。
通过这个简单的抓取页面案例可以看出,Python 的优势在于语法简洁,并且在特定场景下有相对成熟的库可以使用,便于快速实现功能。下面贴出 main.py 的代码和结构。
# 简单的爬虫程序,爬取Top250
from bs4 import BeautifulSoup
import re
import urllib.error
import urllib.request
import xlwt
# Pre-compiled regex patterns, each applied to the HTML of one
# <div class="item"> movie block (stringified in getData below).
findLink = re.compile(r'<a href="(.*?)">')  # detail-page URL
findImsSrc = re.compile(r'<img .*src="(.*?)"', re.S)  # poster image URL; re.S because the <img> tag may span lines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # movie title (may match twice: native + foreign)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # numeric rating
findSimpleSum = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary (optional on the page)
findRelatedContent = re.compile(r'<p class="">(.*?)</p>', re.S)  # director/cast/year blurb; re.S: spans lines
def getData(url):
    """Fetch one listing page and parse every movie entry on it.

    Args:
        url: Absolute URL of the listing page to fetch.

    Returns:
        A list with one entry per movie, each a 7-element list:
        [link, image URL, native title, foreign title, rating,
        one-line summary, related info].  Returns an empty list when
        the page could not be fetched or contained no items.
    """
    dataList = []
    # Fetch the page with urllib; spoof a browser User-Agent so the
    # server does not reject the request.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
            }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Best-effort logging; html stays "" and the loop below yields nothing.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Parse with BeautifulSoup; regexes then run on each item's raw HTML.
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('div', class_="item"):
        item = str(item)  # regexes below operate on the raw HTML string
        links = re.findall(findLink, item)
        imgs = re.findall(findImsSrc, item)
        titles = re.findall(findTitle, item)
        # Skip malformed items instead of crashing on [0] of an empty list.
        if not links or not imgs or not titles:
            continue
        data = [links[0], imgs[0]]
        # Title: one or two matches (native title, optional foreign title).
        cTitle = titles[0]
        # The foreign title carries a leading "/" separator — strip it.
        fTitle = titles[1].replace("/", "") if len(titles) == 2 else ""
        data.append(cTitle)
        data.append(fTitle)
        # BUG FIX: the original appended the whole findall() list for the
        # three fields below; xlwt cannot write a list cell, so take the
        # first match (or a placeholder when absent).
        rating = re.findall(findRating, item)
        data.append(rating[0] if rating else "")
        # The one-line summary may be missing on some movies.
        simpleSum = re.findall(findSimpleSum, item)
        data.append(simpleSum[0] if simpleSum else " ")
        relatedContent = re.findall(findRelatedContent, item)
        data.append(relatedContent[0] if relatedContent else "")
        dataList.append(data)
    return dataList
# Program entry point: crawl every listing page, then export to Excel.
if __name__ == "__main__":
    dataList = []
    # Base listing URL; the page offset is appended below.
    baseUrl = "这里需要自己修改填入地址"
    # 250 entries, 25 per page -> offsets 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        dataList += getData(baseUrl + str(offset))
    # Export the collected rows to an .xls workbook via xlwt.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("sheet1", cell_overwrite_ok=True)
    col = ["电影链接", "图片链接", "影片", "评分", "概述", "影片其他信息"]
    for colIdx, colName in enumerate(col):
        sheet.write(0, colIdx, colName)  # header row
    # BUG FIX: iterate over what was actually scraped instead of a
    # hard-coded range(0, 250) — fewer rows raised IndexError before.
    for row, data in enumerate(dataList, start=1):
        cTitle = data[2]
        fTitle = data[3]
        # Join native/foreign titles; omit the "/" when there is no foreign title.
        titleUnion = cTitle + "/" + fTitle if fTitle else cTitle
        sheet.write(row, 0, data[0])  # movie link
        sheet.write(row, 1, data[1])  # image link
        sheet.write(row, 2, titleUnion)
        sheet.write(row, 3, data[4])  # rating
        sheet.write(row, 4, data[5])  # summary
        sheet.write(row, 5, data[6])  # related info
    book.save("fileTop250.xls")