Web Crawling

Basic crawler template:

# Example: 12306
import requests

url = 'https://www.12306.cn/mormhweb/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# verify=False skips certificate verification (12306 used a self-signed certificate)
r = requests.get(url, headers=headers, verify=False)
print(r.content.decode('utf-8'))

import urllib.request
import ssl

# globally disable HTTPS certificate verification
ssl._create_default_https_context = ssl._create_unverified_context
url = 'https://www.12306.cn/mormhweb/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
r = urllib.request.urlopen(request)
print(r.read().decode('utf-8'))

Common server port numbers:

ftp (File Transfer Protocol): 21
ssh (remote login): 22
mysql: 3306
mongodb: 27017
redis: 6379
oracle: 1521

Downloading a web page with a crawler

import urllib.request

url = …
# download the page straight to a local file
urllib.request.urlretrieve(url=…, filename='….html')

Downloading images or audio with a crawler follows the same pattern as downloading a web page.
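For instance, a minimal sketch with urlretrieve; the file URLs below are placeholders for illustration, not taken from the original:

import urllib.request

# placeholder addresses -- replace with real image/audio URLs
img_url = 'https://example.com/picture.jpg'
audio_url = 'https://example.com/song.mp3'
urllib.request.urlretrieve(url=img_url, filename='picture.jpg')
urllib.request.urlretrieve(url=audio_url, filename='song.mp3')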

Customizing the Request object:

headers = {
    'User-Agent': 'xxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

POST requests

import urllib.request
import urllib.parse

What is the difference between GET and POST?

For a GET request, encoding the parameters only requires urlencode.
For a POST request, the parameters need urlencode followed by encode() to turn them into bytes.
A GET request joins the encoded parameters onto the URL, while a POST request passes the encoded parameters through the data argument of the customized Request object.
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
data = {
    'kw': 'first'
}
# POST parameters: urlencode, then encode to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, headers=headers, data=data)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
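For contrast with the POST example above, a GET request only needs urlencode and the encoded parameters are joined onto the URL. A minimal sketch, using a Baidu search URL as an assumed example:

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'
params = {'wd': 'ip'}
# urlencode is enough for GET; the result is appended to the URL
url = base_url + urllib.parse.urlencode(params)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))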

For pages that require a cookie, add a Cookie field to the headers.
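A minimal sketch of that idea; the URL and cookie string are placeholders (the real cookie value would be copied from the browser's developer tools):

import urllib.request

url = 'https://example.com/'   # placeholder URL
headers = {
    'User-Agent': 'xxxxx',
    # paste the Cookie value captured in the browser here
    'Cookie': 'xxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))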

Handlers

As the business logic grows more complex, customizing the Request object is no longer enough
(dynamic cookies and proxies cannot be handled through Request customization alone).
To use a proxy, you must use a handler.
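For dynamic cookies, the usual approach is an HTTPCookieProcessor handler built on a CookieJar, so cookies set by the server are carried automatically on later requests. A minimal sketch, not from the original post:

import urllib.request
import http.cookiejar

cookie_jar = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)
# the first request stores whatever cookies the server sets;
# later requests made through the same opener send them back automatically
response = opener.open('https://www.baidu.com/')
print(response.read().decode('utf-8'))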

Proxies

import urllib.request

url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# route the request through an HTTP proxy
handler = urllib.request.ProxyHandler(proxies={'http': '111.11.98.58:9000'})
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
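If every later call to urllib.request.urlopen should also go through the proxy, the opener can be installed globally; a small follow-on sketch reusing the handler and request above:

# make the proxy opener the default used by urllib.request.urlopen
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)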

Crawling with regular expressions

Scraping images from Qiushibaike:

import urllib.request
import urllib.parse
import re

def download_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    html_string = response.read().decode('utf-8')
    return html_string

def parse_data(html_string):
    # regex match: capture the src attribute of each image tag
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', re.S)
    src_list = pattern.findall(html_string)
    return src_list
def save_data(src_list):
    print('Starting image download…')
    for src in src_list:
        url = 'http:' + src
        img_name = src.split('/')[-1]
        file_path = 'qiubai/' + img_name
        urllib.request.urlretrieve(url, file_path)
        print(img_name + ' downloaded')
    print('All images downloaded.')

def main():
    url = 'https://www.qiushibaike.com/'
    # download the data
    html_string = download_url(url)
    # parse the data
    src_list = parse_data(html_string)
    # save it locally
    save_data(src_list)

if __name__ == '__main__':
    main()

Crawling with XPath

First install the lxml library:
pip install lxml -i <pip mirror URL>
from lxml import etree
A browser is, in effect, an HTML parser.

Parse a local HTML file:
html_tree = etree.parse('XX.html')
Parse a server response:
html_tree = etree.HTML(response.read().decode('utf-8'))
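A small self-contained sketch of parsing an HTML string and running XPath queries on it; the HTML snippet is made up for illustration:

from lxml import etree

html_string = '<div class="content"><ul><li><span class="name">Tom</span></li><li><span class="name">Jerry</span></li></ul></div>'
html_tree = etree.HTML(html_string)
# text() extracts node text, @attr extracts an attribute value
names = html_tree.xpath('//li/span[@class="name"]/text()')
print(names)  # ['Tom', 'Jerry']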

Scraping usernames and avatar URLs with XPath:
from lxml import etree
import urllib.request
import urllib.parse
import json

url = 'http://neihanshequ.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# get the HTML string from the response
html_string = response.read().decode('utf-8')
# turn the HTML string into an etree structure
html_tree = etree.HTML(html_string)
# find every li tag in the tree
li_nodes = html_tree.xpath('//div[@class="content"]/ul/li')
# pull the username and avatar URL out of each li tag and collect them in a list
items = []
for li_node in li_nodes:
    user_dic = {}
    name = li_node.xpath('.//div[contains(@class,"header")]/a/div/span[@class="name"]/text()')
    text = li_node.xpath('.//div[@class="detail-wrapper"]/div/a/img/@data-src')
    user_dic['name'] = name[0]
    user_dic['url'] = text[0]
    items.append(user_dic)
# write the data to a JSON file
data = json.dumps(items, ensure_ascii=False)
with open('neihan.json', 'w', encoding='utf-8') as fp:
    fp.write(data)

Crawling with BeautifulSoup

import urllib.request
from bs4 import BeautifulSoup
from Item import Stock

url = 'http://quote.stockstar.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('gb2312')

soup = BeautifulSoup(content, 'lxml')

# each table row under #datalist is one stock
tr_list = soup.select('#datalist > tr')
stocks = []
for tr in tr_list:
    code = tr.find_all('td')[0].get_text()
    name = tr.find_all('td')[1].get_text()
    price = tr.find_all('td')[2].get_text()
    stock = Stock(code, name, price)
    stocks.append(stock.__dict__)

import json
str1 = json.dumps(stocks, ensure_ascii=False)
with open('stock.json', 'w', encoding='utf-8') as fp:
    fp.write(str1)
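The Stock class imported from Item is not shown in the original post; a minimal sketch of what it might look like, assuming it only holds the three scraped fields (so stock.__dict__ yields a plain dict that json.dumps can serialize):

# Item.py -- hypothetical definition of the Stock item
class Stock(object):
    def __init__(self, code, name, price):
        self.code = code
        self.name = name
        self.price = price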
