第七周第一节实践课,学习的内容是爬虫。之前一直对爬虫有兴趣,但因为懒所以没有付诸实践,现在正是大好机会。
【目标】
爬取猫眼网站排名前100的电影,保存到csv中,并保存它们的封面。
【遇到的主要问题】
- 正则表达式。
- 别忘了创建保存图片用的文件夹。
【代码存档】
import csv
import re
import requests
from requests import RequestException
# 2. Main dispatcher
def main():
    """Crawl the board page by page and persist every movie found.

    Requests the board with offsets 0, 10, ..., 90. Each item parsed from a
    page is handed to ``store_data`` and its cover to ``download_thumb``.
    A page that cannot be fetched is reported and skipped.
    """
    # Starting URL of the board
    start_url = "http://maoyan.com/board/4"
    for offset in range(0, 100, 10):
        # Fetch the response text for this page
        html = get_one_page(url=start_url, offset=offset)
        if html is None:
            # "{}" placeholders are required with str.format(); "%s"
            # placeholders would be printed literally.
            print("链接:{}?offset={}异常".format(start_url, offset))
            continue
        for item in parse_one_page(html=html):
            store_data(item)
            download_thumb(item["title"], item["thumb"])
# 3. Page fetching
# Request one page and return the response body
def get_one_page(url, offset, timeout=10):
    """Fetch one page and return its text.

    Args:
        url: Base URL of the page.
        offset: Value sent as the ``offset`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    print("开始回馈!")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    try:
        response = requests.get(
            url=url,
            headers=headers,
            params={"offset": offset},
            # Without a timeout a stalled connection would block forever.
            timeout=timeout,
        )
    except RequestException as e:
        # Report the failure instead of dropping it silently.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
# 4. Page parsing
# Parse a single page
def parse_one_page(html):
    """Yield one dict per movie entry matched in *html*.

    Args:
        html: Text of one board page.

    Yields:
        Dicts with the keys ``index``, ``thumb``, ``title``, ``actors``,
        ``release_time``, ``area`` and ``score``.
    """
    print("解析中!")
    # Raw strings: "\d" inside a normal literal is an invalid escape sequence.
    pattern = (
        r'<dd>.*?board-index.*?">(\d+)</i>.*?data-src="(.*?)".*?/>'
        r'.*?movie-item-info.*?title="(.*?)".*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
        r'.*?fraction">(\d+)</i>.*?</dd>'
    )
    # re.S lets "." match newlines, so one entry may span several lines
    regex = re.compile(pattern, re.S)
    for index, thumb, title, star, release, integer, fraction in regex.findall(html):
        # Presumably drops a 5-character label in front of the release
        # info; verify against the page markup.
        release_info = release.strip()[5:]
        yield {
            'index': index,
            'thumb': get_large_thumb(thumb),
            'title': title,
            # Presumably drops a 3-character label in front of the cast
            # list; verify against the page markup.
            'actors': star.strip()[3:],
            'release_time': get_release_time(release_info),
            'area': get_release_area(release_info),
            # Integer and fraction parts are joined as text
            'score': integer + fraction,
        }
# Extract the release time
def get_release_time(data):
    """Return the part of *data* that precedes the first ``(``.

    For single-line input without a ``(``, the whole string is returned.

    Args:
        data: Release information text, e.g. ``"1993-01-01(somewhere)"``.
    """
    print("It`s high noon!")
    # Raw string: "\(" inside a normal literal is an invalid escape sequence.
    regex = re.compile(r'^(.*?)(\(|$)')
    found = regex.search(data)
    return found.group(1)
# Extract the release area
def get_release_area(data):
    """Return the text inside the last parenthesised group of *data*.

    Args:
        data: Release information text, e.g. ``"1993-01-01(somewhere)"``.

    Returns:
        The text between the parentheses, or ``'未知'`` when *data* has no
        parenthesised part.
    """
    print("Where?Here!")
    # Raw string: "\(" and "\)" inside a normal literal are invalid escapes.
    regex = re.compile(r'.*\((.*)\)')
    found = regex.search(data)
    if found is None:
        return '未知'
    return found.group(1)
# Get the full-size cover URL
def get_large_thumb(url):
    """Return *url* with everything from the first ``@`` onwards removed.

    A URL without ``@`` is returned unchanged instead of raising
    ``AttributeError`` on a failed regex match.

    Args:
        url: Cover image address, possibly carrying an ``@...`` suffix.
    """
    print("Get Large Pic!")
    # partition() yields the whole string as the head when "@" is absent.
    base, _, _ = url.partition('@')
    return base
# Persist one record
def store_data(item):
    """Append *item* as one row to ``movie.csv``.

    The row holds the values stored under the keys index, thumb, title,
    actors, release_time, area and score, in that order. An exception raised
    while building or writing the row is printed together with the item
    instead of being propagated.

    Args:
        item: Mapping that provides the seven keys listed above.
    """
    print("Loading")
    columns = ('index', 'thumb', 'title', 'actors', 'release_time', 'area', 'score')
    with open('movie.csv', 'a', newline='', encoding='utf-8') as data_csv:
        # csv.writer is used with its defaults (no dialect or delimiter given)
        try:
            row = [item[key] for key in columns]
            csv.writer(data_csv).writerow(row)
        except Exception as e:
            print(e)
            print(item)
# Download a cover image
def download_thumb(title, url):
    """Download the image at *url* and save it as ``thumb/<title>.jpg``.

    The ``thumb`` folder is created when missing. Path separators in *title*
    are replaced with ``_`` so the file always lands inside that folder.
    Request failures, including non-success HTTP status codes and timeouts,
    are printed rather than raised.

    Args:
        title: Movie title, used as the file name.
        url: Address of the cover image.
    """
    import os  # local import keeps this block self-contained

    print("Download Pic!")
    # Create the target folder on demand so open() below cannot fail on it.
    os.makedirs('thumb', exist_ok=True)
    safe_title = title.replace('/', '_').replace('\\', '_')
    try:
        response = requests.get(url=url, timeout=10)
        # Avoid saving an HTTP error page as an image.
        response.raise_for_status()
        # Write the raw bytes; the with-block closes the file.
        with open(os.path.join('thumb', safe_title + '.jpg'), 'wb') as f:
            f.write(response.content)
    except RequestException as e:
        print(e)
# Script entry point: run the crawl, then report completion
if __name__ == "__main__":
    main()
    print("爬取完成!")