Web Scraping Basics
Web scraping: writing a program to fetch resources from the web.
How it works: use code to imitate a browser, request a URL, and extract the content you need from the response.
Required package: urllib
Two ways to import a package in Python (a short sketch follows this list):
1. import package_name
2. from package_name import module
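A minimal illustration of the two import styles, using the urllib package that the example below relies on:
# style 1: import the package path and qualify the name at the call site
import urllib.request
resp = urllib.request.urlopen("http://www.baidu.com")

# style 2: import a single name from the module and call it directly
from urllib.request import urlopen
resp = urlopen("http://www.baidu.com")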
Code
# -*- coding: utf-8 -*-
# @Time  : 2021/6/19 10:59
# @Author: 小贺
# @File  : 01.py
# import what we need
from urllib.request import urlopen

# the page to request
url = "http://www.baidu.com"
# fetch the page
readUrl = urlopen(url)
# read the body once and decode it (reading the stream a second time would
# return nothing, because it is exhausted after the first read)
page_source = readUrl.read().decode("utf-8")
# print it
print(page_source)
# save it to a file
with open("baidu.html", mode="w", encoding="utf-8") as f:
    f.write(page_source)
print("Saved")
Web Requests
1. Server-side rendering: the server combines the HTML and the data and sends the finished page to the browser
(the data is visible in the page source).
2. Client-side rendering: the first request returns only the HTML skeleton; a second request fetches the data, which the browser then renders
(the data is not visible in the page source).
Packet-capture tool for HTML
The Network panel in the browser's developer tools (used to find the request that actually carries the data).
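For a client-rendered page, the usual approach is to copy the data request found in the Network panel and call it directly. A minimal sketch, assuming a made-up JSON endpoint (the URL and parameters here are placeholders, not a real API):
import requests

# hypothetical data URL spotted in the browser's Network panel
data_url = "https://example.com/api/list"
params = {"page": 1}

resp = requests.get(data_url, params=params)
print(resp.json())   # client-rendered data usually arrives as JSON
resp.close()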
The HTTP Protocol
Protocol: the agreed language two computers use to talk to each other. Common examples are TCP/IP, SOAP, HTTP, SMTP, and so on.
HTTP: short for Hyper Text Transfer Protocol, the protocol of the World Wide Web.
HTTP splits a message into three parts; both requests and responses follow this three-part structure.
Request:
1. Request line -> request method (GET / POST)
2. Request headers -> extra information for the server
3. Request body -> request parameters
Response:
1. Status line -> status code
2. Response headers -> extra information for the client
3. Response body -> the data the client asked for (HTML, JSON, etc.)
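One way to see these pieces from Python is to inspect a requests response object; a minimal sketch (Baidu is only used as a convenient example URL):
import requests

resp = requests.get("http://www.baidu.com")

# request side: method (request line) and request headers
print(resp.request.method)      # GET
print(resp.request.headers)

# response side: status code (status line), response headers, response body
print(resp.status_code)         # e.g. 200
print(resp.headers)
print(resp.text[:200])          # first part of the body
resp.close()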
The requests module
Installation
pip install requests
Inside mainland China you can use the Tsinghua mirror:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple package_name
Scraping with requests
GET
import requests

name = input("Enter a search term: ")
url = f"http://www.sogou.com/web?query={name}"
# send a browser-like User-Agent, otherwise the site may refuse the request
dic = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
}
# the header dict must be passed as headers=...; the second positional
# argument of requests.get is params, not headers
req = requests.get(url, headers=dic)
print(req.text)
POST
import requests

word = input("Enter the word to translate: ")
url = "https://fanyi.baidu.com/sug"
# data to send in the request body
dat = {
    "kw": word
}
req = requests.post(url, data=dat)
print(req.json())
Scraping with requests plus query parameters
import requests

url = "https://movie.douban.com/j/chart/top_list"
start = input("Start (offset): ")
# query parameters sent to the server
parameters = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": start,
    "limit": 20
}
# browser information sent to the server
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
}
req = requests.get(url=url, params=parameters, headers=head)
# print the final URL (with the parameters appended)
print(req.request.url)
# get the data
print(req.json())
# close the connection
req.close()
Data Parsing
1. Regular expression parsing
Reference site: oschina
Parsing with re
1. findall — returns all matches as a list
import re
lst = re.findall(r"\d+", "My number is 10081, his is 10080")
print(lst)
2. finditer — returns an iterator of match objects
import re
it = re.finditer(r"\d+", "My number is 10081, his is 10080")
print(it)
for i in it:
    print(i)
    print(i.group())
3. search — scans the whole string and returns the first match it finds
import re
it = re.search(r"\d+", "My number is 10081, his is 10080")
print(it.group())
4. match — matches only from the start of the string
import re
it = re.match(r"\d+", "100 My number is 10081, his is 10080")
print(it.group())
5. Precompiled regular expressions
import re
obj = re.compile(r"\d+")
# a compiled pattern takes only the target string; findall returns a list
lst = obj.findall("100 My number is 10081, his is 10080")
print(lst)
6. Extracting text from the matched content with named groups
import re
# re.S lets . also match newlines, so the pattern can span lines
obj = re.compile(r"<div class='(?P<class>\d+)'>(?P<name>.*?)</div>", re.S)
# (?P<name>...) gives a captured group a name we can look up later
s = """
<div class='1'>China</div>
<div class='2'>China 2</div>
"""
it = obj.finditer(s)
for i in it:
    print(i.group("class"), i.group("name"))
Example: Douban Top 250
import requests
import re
import csv

url = "https://movie.douban.com/top250"
heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
}
req = requests.get(url, headers=heads)
page_content = req.text
# parse with a regular expression
obj = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?<span class="rating_num" property="v:average">(?P<eva>.*?)</span>'
    , re.S)
result = obj.finditer(page_content)
# write the results to a CSV file
f = open("ranking.csv", mode="w", encoding="utf-8", newline="")
cvs = csv.writer(f)
for i in result:
    # print(i.group("name"))
    # print(i.group("eva"))
    dic = i.groupdict()
    cvs.writerow(dic.values())
print("over")
# close the file and the connection
f.close()
req.close()
Example 2 (verify=False turns off SSL certificate verification)
import requests
import re

domain = "https://www.dydytt.net/index.htm"
req = requests.get(url=domain)
req.encoding = "gb2312"
# locate the "2021新片精品" block on the home page
obj1 = re.compile(
    r'2021新片精品.*?<ul>(?P<html>.*?)</ul>', re.S)
# pull the links to the detail pages out of that block
obj2 = re.compile(r"</a>]<a href='(?P<ul>.*?)'", re.S)
# on a detail page, grab the movie title and the download link
obj3 = re.compile(
    r'<title>(?P<name>.*?)迅雷下载.*?下载地址2:<a href="(?P<url>.*?)"', re.S)
result = obj1.finditer(req.text)
collect_url_list = []
for i in result:
    html = i.group("html")
    # find the child pages
    result2 = obj2.finditer(html)
    for i2 in result2:
        url = "https://www.dydytt.net" + i2.group("ul")
        # print(url)
        collect_url_list.append(url)
for url in collect_url_list:
    req = requests.get(url)
    req.encoding = "gb2312"
    # print(req.text)
    result3 = obj3.search(req.text)
    print(result3.group("name"))
    print(result3.group("url"))
req.close()
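The verify=False mentioned in the heading is not actually used above; if the site's HTTPS certificate triggers an SSLError, the request can be sent like this (it skips certificate verification, so only use it when you accept that risk):
import requests
import urllib3

urllib3.disable_warnings()   # silence urllib3's InsecureRequestWarning
req = requests.get("https://www.dydytt.net/index.htm", verify=False)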
2. Parsing with bs4
import requests
from bs4 import BeautifulSoup

url = "http://www.xinfadi.com.cn/marketanalysis/0/list/1.shtml"
req = requests.get(url)
# parse the page ("html.parser" tells bs4 it is dealing with HTML)
page = BeautifulSoup(req.text, "html.parser")
# find the table tag whose class attribute is hq_table
table = page.find("table", attrs={"class": "hq_table"})
# find all the tr tags (rows), skipping the header row
trs = table.find_all("tr")[1:]
for tr in trs:
    tds = tr.find_all("td")
    name = tds[0].text
    low = tds[1].text
    avg = tds[2].text
    kg = tds[3].text
    dan = tds[4].text
    data = tds[5].text
    print(name, low, avg, kg, dan, data)
Save the results to a CSV file
import requests
from bs4 import BeautifulSoup
import csv

url = "http://www.xinfadi.com.cn/marketanalysis/0/list/1.shtml"
req = requests.get(url)
# parse
page = BeautifulSoup(req.text, "html.parser")
table = page.find("table", attrs={"class": "hq_table"})
trs = table.find_all("tr")[1:]
f = open("caijia.csv", mode="w", encoding="utf-8", newline="")
c = csv.writer(f)
for tr in trs:
    tds = tr.find_all("td")
    name = tds[0].text
    low = tds[1].text
    avg = tds[2].text
    kg = tds[3].text
    dan = tds[4].text
    data = tds[5].text
    print(name, low, avg, kg, dan, data)
    c.writerow([name, low, avg, kg, dan, data])
f.close()
Downloading images
import requests
from bs4 import BeautifulSoup
import time

url = "https://www.umei.net/bizhitupian/diannaobizhi/"
req = requests.get(url)
req.encoding = "utf-8"
# print(req.text)
main_page = BeautifulSoup(req.text, "html.parser")
main_list = main_page.find("div", class_="TypeList").find_all("a")
for a in main_list:
    href_t = a.get("href")
    href = "http://www.umei.net/" + href_t
    # fetch the child page
    child_req = requests.get(href)
    child_req.encoding = "utf-8"
    child_req_text = child_req.text
    # hand it to bs4
    child_page = BeautifulSoup(child_req_text, "html.parser")
    p = child_page.find("p", align="center")
    img = p.find("img")
    src = img.get("src")
    img_resp = requests.get(src)
    # print(src)
    # use the last part of the image URL as the file name
    name = src.split("/")[-1]
    print(name)
    with open("img/" + name, mode="wb") as f:
        f.write(img_resp.content)
    time.sleep(1)
3. Parsing with xpath