Web Crawling

Basic crawler template:

# Example: 12306
import requests

url = 'https://www.12306.cn/mormhweb/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# verify=False skips certificate verification (12306 used a self-signed certificate)
r = requests.get(url, headers=headers, verify=False)
print(r.content.decode('utf-8'))

import urllib.request
import ssl

# globally disable HTTPS certificate verification
ssl._create_default_https_context = ssl._create_unverified_context
url = 'https://www.12306.cn/mormhweb/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
r = urllib.request.urlopen(request)
print(r.read().decode('utf-8'))

Common server port numbers:

ftp (File Transfer Protocol): 21
ssh (remote login): 22
mysql: 3306
mongodb: 27017
redis: 6379
oracle: 1521

Downloading a web page with a crawler

import urllib.request

url = …
# download the page straight to a local file
urllib.request.urlretrieve(url=…, filename='….html')

Downloading images or audio with a crawler follows the same pattern as downloading a web page.
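For instance, a minimal sketch with urlretrieve; the file URLs below are placeholders for illustration, not taken from the original:

import urllib.request

# placeholder addresses -- replace with real image/audio URLs
img_url = 'https://example.com/picture.jpg'
audio_url = 'https://example.com/song.mp3'
urllib.request.urlretrieve(url=img_url, filename='picture.jpg')
urllib.request.urlretrieve(url=audio_url, filename='song.mp3')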

Customizing the Request object:

headers = {
    'User-Agent': 'xxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

POST requests

import urllib.request
import urllib.parse

What is the difference between GET and POST?

For a GET request, encoding the parameters only requires urlencode.
For a POST request, the parameters need urlencode followed by encode() to turn them into bytes.
A GET request joins the encoded parameters onto the URL, while a POST request passes the encoded parameters through the data argument of the customized Request object.
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
data = {
    'kw': 'first'
}
# POST parameters: urlencode, then encode to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, headers=headers, data=data)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
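For contrast with the POST example above, a GET request only needs urlencode and the encoded parameters are joined onto the URL. A minimal sketch, using a Baidu search URL as an assumed example:

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'
params = {'wd': 'ip'}
# urlencode is enough for GET; the result is appended to the URL
url = base_url + urllib.parse.urlencode(params)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))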

For pages that require a cookie, add a Cookie field to the headers.
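A minimal sketch of that idea; the URL and cookie string are placeholders (the real cookie value would be copied from the browser's developer tools):

import urllib.request

url = 'https://example.com/'   # placeholder URL
headers = {
    'User-Agent': 'xxxxx',
    # paste the Cookie value captured in the browser here
    'Cookie': 'xxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))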

Handlers

As the business logic grows more complex, customizing the Request object is no longer enough
(dynamic cookies and proxies cannot be handled through Request customization alone).
To use a proxy, you must use a handler.
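For dynamic cookies, the usual approach is an HTTPCookieProcessor handler built on a CookieJar, so cookies set by the server are carried automatically on later requests. A minimal sketch, not from the original post:

import urllib.request
import http.cookiejar

cookie_jar = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)
# the first request stores whatever cookies the server sets;
# later requests made through the same opener send them back automatically
response = opener.open('https://www.baidu.com/')
print(response.read().decode('utf-8'))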

Proxies

import urllib.request

url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# route the request through an HTTP proxy
handler = urllib.request.ProxyHandler(proxies={'http': '111.11.98.58:9000'})
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
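If every later call to urllib.request.urlopen should also go through the proxy, the opener can be installed globally; a small follow-on sketch reusing the handler and request above:

# make the proxy opener the default used by urllib.request.urlopen
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)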

Crawling with regular expressions

Scraping images from Qiushibaike:

import urllib.request
import urllib.parse
import re

def download_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    html_string = response.read().decode('utf-8')
    return html_string

def parse_data(html_string):
    # regex match: capture the src attribute of each image tag
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', re.S)
    src_list = pattern.findall(html_string)
    return src_list
def save_data(src_list):
    print('Starting image download…')
    for src in src_list:
        url = 'http:' + src
        img_name = src.split('/')[-1]
        file_path = 'qiubai/' + img_name
        urllib.request.urlretrieve(url, file_path)
        print(img_name + ' downloaded')
    print('All images downloaded.')

def main():
    url = 'https://www.qiushibaike.com/'
    # download the data
    html_string = download_url(url)
    # parse the data
    src_list = parse_data(html_string)
    # save it locally
    save_data(src_list)

if __name__ == '__main__':
    main()

Crawling with XPath

First install the lxml library:
pip install lxml -i <pip mirror URL>
from lxml import etree
A browser is, in effect, an HTML parser.

Parse a local HTML file:
html_tree = etree.parse('XX.html')
Parse a server response:
html_tree = etree.HTML(response.read().decode('utf-8'))
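A small self-contained sketch of parsing an HTML string and running XPath queries on it; the HTML snippet is made up for illustration:

from lxml import etree

html_string = '<div class="content"><ul><li><span class="name">Tom</span></li><li><span class="name">Jerry</span></li></ul></div>'
html_tree = etree.HTML(html_string)
# text() extracts node text, @attr extracts an attribute value
names = html_tree.xpath('//li/span[@class="name"]/text()')
print(names)  # ['Tom', 'Jerry']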

Scraping usernames and avatar URLs with XPath:
from lxml import etree
import urllib.request
import urllib.parse
import json

url = 'http://neihanshequ.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# get the HTML string from the response
html_string = response.read().decode('utf-8')
# turn the HTML string into an etree structure
html_tree = etree.HTML(html_string)
# find every li tag in the tree
li_nodes = html_tree.xpath('//div[@class="content"]/ul/li')
# pull the username and avatar URL out of each li tag and collect them in a list
items = []
for li_node in li_nodes:
    user_dic = {}
    name = li_node.xpath('.//div[contains(@class,"header")]/a/div/span[@class="name"]/text()')
    text = li_node.xpath('.//div[@class="detail-wrapper"]/div/a/img/@data-src')
    user_dic['name'] = name[0]
    user_dic['url'] = text[0]
    items.append(user_dic)
# write the data to a JSON file
data = json.dumps(items, ensure_ascii=False)
with open('neihan.json', 'w', encoding='utf-8') as fp:
    fp.write(data)

Crawling with BeautifulSoup

import urllib.request
from bs4 import BeautifulSoup
from Item import Stock

url = 'http://quote.stockstar.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('gb2312')

soup = BeautifulSoup(content, 'lxml')

# each table row under #datalist is one stock
tr_list = soup.select('#datalist > tr')
stocks = []
for tr in tr_list:
    code = tr.find_all('td')[0].get_text()
    name = tr.find_all('td')[1].get_text()
    price = tr.find_all('td')[2].get_text()
    stock = Stock(code, name, price)
    stocks.append(stock.__dict__)

import json
str1 = json.dumps(stocks, ensure_ascii=False)
with open('stock.json', 'w', encoding='utf-8') as fp:
    fp.write(str1)
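The Stock class imported from Item is not shown in the original post; a minimal sketch of what it might look like, assuming it only holds the three scraped fields (so stock.__dict__ yields a plain dict that json.dumps can serialize):

# Item.py -- hypothetical definition of the Stock item
class Stock(object):
    def __init__(self, code, name, price):
        self.code = code
        self.name = name
        self.price = price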
