01 - Fetching proxy IPs
import requests


def get_proxy_ips():
    # The vendor API returns one IP per line; a JSON body (starting with '{')
    # signals an error such as extracting too frequently
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        if response.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            # Drop the trailing empty string left by the final newline
            return response.text.split('\n')[:-1]
    else:
        print('Request failed!')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    ips = get_proxy_ips()
    if ips:
        # Route both http and https traffic through the first proxy
        proxies = {
            'http': ips[0],
            'https': ips[0]
        }
        response = requests.get(url, headers=headers, proxies=proxies)
        print(response.text)
    else:
        print('Did not get any proxies')


if __name__ == '__main__':
    get_net_data()
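A quick way to confirm that traffic really leaves through a proxy is to request a service that echoes the caller's IP. A minimal sketch, assuming httpbin.org is reachable; check_proxy is a hypothetical helper, not part of the lesson code:

import requests


def check_proxy(ip):
    # httpbin.org/ip echoes the origin IP, so the response shows
    # whether the request actually went through the proxy
    proxies = {'http': ip, 'https': ip}
    try:
        response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
        print(response.json())
        return True
    except requests.exceptions.RequestException:
        return False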
02 - Optimizing the proxy workflow
import requests
import time


def get_proxy_ips():
    # Same vendor API as in 01; a JSON body (starting with '{') signals
    # an error such as extracting too frequently
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        if response.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            return response.text.split('\n')[:-1]
    else:
        print('Request failed!')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    while True:
        ips = get_proxy_ips()
        if not ips:
            # Back off briefly before asking the vendor for IPs again
            time.sleep(1)
            continue
        # Try each proxy in turn until one request succeeds
        for ip in ips:
            proxies = {
                'http': ip,
                'https': ip
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
                if response.status_code == 200:
                    return response.text
                else:
                    print('Data request failed!')
            except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
                print('Proxy failed or timed out, trying the next one')


if __name__ == '__main__':
    result = get_net_data()
    print(result)
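The hand-rolled retry loop above can also lean on requests' built-in retry machinery from urllib3. A minimal sketch, independent of the proxy logic; the retry count, backoff factor, and status list here are arbitrary choices:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session():
    # Retry up to 3 times with exponential backoff; also retry on
    # common transient 5xx responses
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

A session built this way accepts the same headers and proxies arguments as requests.get, so it can slot into the loop without other changes.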
03 - Using bs4
"""
Time:2021/5/26 11:28
Author:Spectre
"""
import requests
from bs4 import BeautifulSoup


def get_net_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    bs = BeautifulSoup(data, 'lxml')
    print(bs)
    # select returns a list of all matches for a CSS selector
    result = bs.select('#p1')
    print(result, len(result), type(result[0]))
    result1 = bs.select('p')
    print(result1, len(result1), type(result1[0]))
    # select_one returns only the first match (a single Tag)
    result2 = bs.select_one('p')
    print(result2, len(result2))
    # .string is the tag's single text child (None if there are several)
    p1 = bs.select_one('#p1')
    print(p1.string)
    # get_text() concatenates all text inside the tag
    p2 = bs.select_one('div>p')
    print(p2.get_text())
    # .contents lists the tag's direct children, tags and strings alike
    p3 = bs.select_one('#p3')
    print(p3.contents)
    # attrs is a dict of the tag's attributes
    img = bs.select_one('img')
    print(img.attrs)
    print(img.attrs['src'])
    a = bs.select('a')
    print(a[0].attrs['href'])
    print('All p tags:', len(bs.select('p')))
    # select can also be called on a Tag to search only inside it
    div = bs.select_one('div')
    result = div.select('p')
    print('p tags inside the div:', len(result))


if __name__ == '__main__':
    data = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p id='p1' class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <img src='./01.jpg' />
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    <div>
    <p id='p3'>This is <b>a</b> paragraph</p>
    <p>This is a paragraph</p>
    </div>
    </body>
    </html>
    """
    if data:
        analysis_data(data)
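The same lookups can be written with bs4's find/find_all methods instead of CSS selectors. A minimal sketch of the equivalences; the tiny html string is just for illustration:

from bs4 import BeautifulSoup

html = "<div><p id='p1'>one</p><p>two</p></div>"
bs = BeautifulSoup(html, 'lxml')
print(bs.find('p', id='p1'))   # same element as bs.select_one('#p1')
print(bs.find_all('p'))        # same list as bs.select('p')
print(bs.find('p')['id'])      # tag['attr'] is shorthand for tag.attrs['attr']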
04 - Beike (ke.com) listings
"""
Time:2021/5/26 15:17
Author:Spectre
"""
import requests
from bs4 import BeautifulSoup


def get_net_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    bs = BeautifulSoup(data, 'lxml')
    # Each listing is an <li> under the result-list wrapper
    house_li = bs.select('.resblock-list-wrapper>li')
    all_house = []
    for li in house_li:
        house = {}
        # Lazy-loaded images keep the real URL in data-original
        img_src = li.select_one('.lj-lazy').attrs['data-original']
        house['img'] = img_src
        name = li.select_one('.name').get_text()
        house['name'] = name
        # Strip newlines and non-breaking spaces from the price text
        price = li.select_one('.main-price').get_text().replace('\n', '').replace('\xa0', '')
        house['price'] = price
        location = li.select_one('.resblock-location').get_text().strip()
        house['location'] = location
        all_house.append(house)
    print(all_house)


if __name__ == '__main__':
    data = get_net_data('https://cd.fang.ke.com/loupan/pg1/')
    if data:
        analysis_data(data)
    else:
        print('error')
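The listing URL ends in pg1, so later pages can presumably be fetched by changing that number. A minimal sketch reusing get_net_data and analysis_data from above; the page count of 3 is arbitrary, and the site may block or redirect unheadered requests:

import time


def get_all_pages(page_count=3):
    for page in range(1, page_count + 1):
        data = get_net_data(f'https://cd.fang.ke.com/loupan/pg{page}/')
        if data:
            analysis_data(data)
        time.sleep(1)  # be polite between page requests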
05 - Working with CSV files
"""
Time:2021/5/26 16:58
Author:Spectre
"""
import csv

# csv.writer takes rows as lists: writerow writes one row, writerows many
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Name', 'Gender', 'Age', 'Score'])
    writer.writerows([
        ['Zhang San', 'Male', 28, 98],
        ['Xiao Ming', 'Male', 28, 98],
        ['Xiao Hua', 'Female', 18, 88],
        ['Xiao Zhang', 'Male', 8, 98],
    ])

# csv.DictWriter takes rows as dicts keyed by the field names
with open('files/test02.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
    # Write a header row with display labels (writeheader() would write
    # the field names themselves)
    writer.writerow({'name': 'Name', 'gender': 'Gender', 'age': 'Age', 'score': 'Score'})
    writer.writerows([
        {'name': 'Zhang San', 'gender': 'Male', 'age': '18', 'score': '98'},
        {'name': 'ded', 'gender': 'Male', 'age': '18', 'score': '98'},
        {'name': 'rfe', 'gender': 'Male', 'age': '18', 'score': '98'},
        {'name': 'frrw', 'gender': 'Male', 'age': '18', 'score': '98'}
    ])

# csv.DictReader yields each row as a dict keyed by the header row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for x in reader:
        print(dict(x))
    # The reader is a one-pass iterator; it is exhausted by the loop
    # above, so this prints an empty list
    print(list(reader))
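Because a csv reader is a one-pass iterator, re-reading the data means either materializing it once or rewinding the file. A minimal sketch of both options:

import csv

with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    rows = list(reader)               # materialize once, reuse freely
    print(rows)
    f.seek(0)                         # or rewind and build a fresh reader
    print(list(csv.DictReader(f)))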
Homework - Scraping the Douban movie chart
"""
Time:2021/5/26 15:11
Author:Spectre
"""
import requests
from bs4 import BeautifulSoup
import csv


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    bs = BeautifulSoup(data, 'lxml')
    # Each movie on the chart page is an .item inside .indent
    movie_li = bs.select('.indent .item')
    all_movie = []
    for li in movie_li:
        movie = {}
        img_src = li.select_one('img').attrs['src']
        movie['img'] = img_src
        # Collapse the whitespace Douban puts inside the title link
        name = li.select_one('.pl2>a').get_text().strip().replace(' ', '').replace('\n', '')
        movie['name'] = name
        a = li.select_one('a').attrs['href']
        movie['link'] = a
        intro = li.select_one('.pl').get_text()
        movie['intro'] = intro
        star = li.select_one('.rating_nums').get_text()
        movie['star'] = star
        # Trim the surrounding parentheses from the comment count
        comments = li.select_one('.star>.pl').get_text()[1:-1]
        movie['comments'] = comments
        all_movie.append(movie)
    return all_movie
if __name__ == '__main__':
    data = get_net_data('https://movie.douban.com/chart')
    if data:
        result = analysis_data(data)
        with open('files/doubanmovie.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, ['img', 'name', 'link', 'intro', 'star', 'comments'])
            # Header row with display labels for each field
            writer.writerow({'img': 'Poster', 'name': 'Title', 'link': 'Link',
                             'intro': 'Release date/cast', 'star': 'Rating', 'comments': 'Comment count'})
            writer.writerows(result)
        with open('files/doubanmovie.csv', 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for x in reader:
                print(dict(x))
    else:
        print('error')
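The Top 250 page from the earlier sections paginates with a start query parameter (0, 25, 50, ...), so in principle the same fetch-and-parse flow could be run over every page. A minimal sketch reusing get_net_data and analysis_data from this homework; note the selector classes on the Top 250 page differ from the chart page, so treat this as a pattern rather than working scraping code:

import time


def get_top250():
    all_movie = []
    for start in range(0, 250, 25):
        data = get_net_data(f'https://movie.douban.com/top250?start={start}')
        if data:
            all_movie.extend(analysis_data(data))
        time.sleep(1)  # be polite between page requests
    return all_movie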