import os
import re
import urllib.request

from bs4 import BeautifulSoup
def get_html(url, encoding='utf-8'):
    """Download ``url`` and return the response body decoded as text.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        encoding: Codec used to decode the raw response bytes. Defaults to
            ``'utf-8'``, the value that was previously hard-coded.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    # The context manager closes the response even when read() or decode()
    # raises; the previous version never closed it.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding)
def get_img(html, save_dir='F:/python_Projectpic'):
    """Download every ``.jpg`` referenced by a ``src="..."`` attribute in ``html``.

    The images are saved as ``0.jpg``, ``1.jpg``, ... inside ``save_dir``, in
    the order in which their addresses appear in ``html``.

    Args:
        html: Page source to scan for image addresses.
        save_dir: Directory that receives the downloaded files. It is created
            when it does not exist yet. Defaults to the location that was
            previously hard-coded.

    Returns:
        The list of image addresses exactly as they appear in ``html``
        (empty when nothing matches).

    Raises:
        urllib.error.URLError: If one of the downloads fails.
    """
    # [^"] keeps each match inside a single attribute value. The previous
    # pattern used '.', which could run across a closing quote into a later
    # attribute (e.g. src="a.png" alt="b.jpg"). The dot before "jpg" is now
    # matched literally as well.
    img_pattern = re.compile(r'src="([^"]*\.jpg)"')
    imglist = img_pattern.findall(html)

    if imglist:
        # Only touch the file system when there is something to save.
        os.makedirs(save_dir, exist_ok=True)

    for index, imgurl in enumerate(imglist):
        # urlretrieve rejects scheme-less "//host/path" addresses, so give
        # them an explicit scheme for the download only; the returned list
        # keeps the address as written in the page.
        download_url = 'https:' + imgurl if imgurl.startswith('//') else imgurl
        target = os.path.join(save_dir, '%s.jpg' % index)
        urllib.request.urlretrieve(download_url, target)
    return imglist
def get_text(html, out_path='F:/python_Projectpic/movies.txt'):
    """Print the text of every ``<span class="title">`` in ``html`` and save it.

    Each title is printed to stdout and written to ``out_path`` as one
    CRLF-terminated line, encoded as UTF-8. The file is overwritten.

    Args:
        html: Page source to parse.
        out_path: File that receives the titles. Defaults to the location
            that was previously hard-coded.

    Returns:
        The BeautifulSoup result set holding the matched ``<span>`` tags.

    Raises:
        OSError: If ``out_path`` cannot be opened for writing.
    """
    # Name the parser explicitly; without it bs4 picks whichever parser
    # happens to be installed and emits a warning.
    soup = BeautifulSoup(html, 'html.parser')

    # Select by tag name and CSS class explicitly. The previous call passed a
    # set as ``attrs``, which only worked through bs4's fallback of treating
    # a non-dict ``attrs`` value as a class filter.
    txt_title = soup.find_all('span', class_='title')

    # newline='' switches off newline translation so '\r\n' is written
    # verbatim; in default text mode Windows would turn it into '\r\r\n'.
    # The with-block closes the file even if a write fails.
    with open(out_path, 'w', encoding='utf-8', newline='') as f:
        for txt in txt_title:
            # get_text() always yields a str, whereas .string is None for a
            # tag with nested markup and would break the concatenation.
            title = txt.get_text()
            print(title)
            f.write(title + '\r\n')
    return txt_title
# Script entry: fetch the Douban Top 250 page, then print and save its titles.
html = get_html("https://movie.douban.com/top250")
# Image download is disabled here; call get_img(html) to fetch the posters too.
titles = get_text(html)
print(titles)
效果图:
笔记:
1.BeautifulSoup soup.findAll方法(通过标签筛选内容)
标签参数tag:
可以传递一个标签的名称或多个标签名称组成的Python列表做标签参数。
如:findAll(["tag1", "tag2", "tag3", "tag4"])
属性参数attributes是用一个Python字典封装一个标签的若干属性和对应的属性值。
如:返回HTML文档中class属性值为attribute1或attribute2的tag标签
findAll("tag", {"class": ["attribute1", "attribute2"]})
2.python中txt文件的编码转换问题utf-8转gbk
f = open(sys.argv[1], 'r', encoding='utf-8')