Scraping part of the Baidu homepage
import requests
from lxml import etree

url = "https://www.baidu.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
}

response = requests.get(url, headers=headers)
content = response.content.decode('utf8')
html = etree.HTML(content)

# The navigation links on the homepage sit inside <div id="u1">.
contents = html.xpath("//div[@id='u1']/a/text()")  # link texts
urls = html.xpath("//div[@id='u1']/a/@href")       # link targets
# Pair each link text with its URL.
egs = []
for content, url in zip(contents, urls):
    eg = {
        "content": content,
        "url": url,
    }
    egs.append(eg)
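
As a quick check, the collected pairs can be printed out; this sketch assumes nothing beyond the egs list built above.

for eg in egs:
    print(eg["content"], "->", eg["url"])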
Scraping Douban movie reviews
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
}
"""
1.构造十页的url
"""
urls = []
for i in range(0,3,1):
i = i*20
url = "https://movie.douban.com/review/best/?start={}".format(i)
urls.append(url)
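
Instead of formatting the query string by hand, requests can encode it via its params argument; a minimal sketch of the same three requests (illustrative only, not used by the steps below):

for i in range(3):
    response = requests.get(
        "https://movie.douban.com/review/best/",
        params={"start": i * 20},  # requests appends ?start=0 / 20 / 40
        headers=headers,
    )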
"""
2.获取每页所有电影的详细url
"""
detail_urls = []
for url in urls:
response = requests.get(url,headers=headers)
content = response.content.decode('utf8')
html = etree.HTML(content)
detail_url = html.xpath('//h2/a/@href')
detail_urls.append(detail_url)
print(detail_urls)
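
Fetching many pages back-to-back risks being throttled; a minimal courtesy-delay sketch (the helper name polite_get and the 1-second delay are assumptions, not documented Douban requirements):

import time

def polite_get(url, headers, delay=1.0):
    time.sleep(delay)  # back off before each request (assumed delay)
    return requests.get(url, headers=headers, timeout=10)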
"""
3.获取每一部电影的影评数据
"""
movies = []
i = 0
for page in detail_urls:
for url in page:
try:
response = requests.get(url, headers=headers)
content = response.content.decode('utf8')
html = etree.HTML(content)
title = html.xpath('//div[@class="subject-title"]/a/text()')[0][2:]
commenter = html.xpath('//header/a/span/text()')[0]
rank = html.xpath('//header//span/@title')[0]
comment = html.xpath('//div[@id="link-report"]//p//text()')
comment = ''.join(comment)
movie = {
"title":title,
"commenter":commenter,
"rank":rank,
"comment":comment,
}
movies.append(movie)
except:
continue
i += 1
print("第{}页已爬取完毕!!".format(i))