# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 09:25:45 2018
@author: Administrator
"""
from urllib import request
# Read a page directly with urllib.request
url = 'http://www.baidu.com'
respo = request.urlopen(url)
print(respo.read().decode('utf-8'))
# Read page content with a custom User-Agent header
url = 'http://tieba.baidu.com'
headers = {'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/64.0.3282.186 Mobile Safari/537.36'}
request1 = request.Request(url = url, headers = headers)
respo1 = request.urlopen(request1)
print(respo1.read().decode('utf-8'))
import requests
# GET request
resp = requests.get('http://www.baidu.com')  # named resp rather than re, so it does not shadow the re module imported below
resp.status_code
resp.headers
resp.cookies
# Download an image and save it to disk
req = requests.get('https://cn.bing.com/rs/2Q/rh/ic/795cdd86/f50c5139.png')
with open(r'C:\Users\Administrator\Desktop\月报周报\t.png', 'wb') as f:
    f.write(req.content)  # the with block closes the file automatically
# GET request with parameters
r = requests.get('https://github.com/Ranxf')  # the most basic GET request, with no parameters
r1 = requests.get(url='http://dict.baidu.com/s', params={'wd': 'python'})  # params is passed as a dict
type(r1.text)
type(r1.content)
r1.content
r1.text
r1.content.decode('utf-8')
r1.url
# Saving binary files
'''
wb: open a file in binary mode for writing only; if the file already exists it is
    overwritten, otherwise a new file is created.
w: write
b: binary mode
f: the file object
'''
# Save an image (.png). Question: how do I batch-download .png files, and how are the URLs passed in? (See the sketch after this block.)
url_png = 'https://static.zhihu.com/heifetz/guide-download-bg.33f5c9a03cb9a6cd685b.png'
r_png = requests.get(url_png, headers = headers)
with open(r'C:\Users\Administrator\Desktop\python_work\爬虫load\t.png', 'wb') as f:
    f.write(r_png.content)  # closed automatically by the with block
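# A sketch answering the question above: batch-download .png files by looping over a
# list of URLs. The URL list and save directory here are placeholders, not from the notes.
import os
png_urls = [
    'https://static.zhihu.com/heifetz/guide-download-bg.33f5c9a03cb9a6cd685b.png',
    # ...append more .png URLs here...
]
save_dir = r'C:\Users\Administrator\Desktop\python_work'
for i, png_url in enumerate(png_urls):
    resp_png = requests.get(png_url, headers=headers)
    if resp_png.status_code == 200:
        # name each file by its index so downloads do not overwrite each other
        with open(os.path.join(save_dir, 'img_{}.png'.format(i)), 'wb') as f:
            f.write(resp_png.content)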
r1.cookies  # Question: what is a cookie, why fetch it, and does it matter for scraping? (See the session sketch below.)
for key, value in r1.cookies.items():
    print(key + '=' + value)
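# On the cookie question above: cookies are small key/value pairs the server sets so it can
# recognize the same client on later requests (e.g. to keep you logged in), which is why they
# matter when scraping pages behind a login. A minimal sketch using requests.Session, which
# stores and resends cookies automatically:
s = requests.Session()
s.get('http://www.baidu.com')    # any Set-Cookie headers are stored on the session
print(s.cookies.get_dict())      # these cookies are sent again on later s.get()/s.post() calls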
from requests.packages import urllib3
urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False below
r = requests.get('http://www.bing.com', verify=False)  # skip certificate verification
# (Response objects have no .elements attribute; inspect r.text / r.headers instead)
r.status_code
url = 'https://www.zhihu.com/signup'
param = {'next' : '%2F'}
# HTTP basic auth (the username and password here are placeholders)
zhihu = requests.get(url = url, params = param, auth = ('user','password'))
zhihu.status_code
zhihu.content.decode('utf-8')
# Use the status code to check whether the request succeeded
import requests
r_cod = requests.get('http://www.baidu.com')
print('connected successfully') if r_cod.status_code == 200 else print('connection failed')
url = 'https://www.12306.cn'
requests.get(url)
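# When these notes were written, https://www.12306.cn served a certificate that was not in the
# default CA bundle, so the plain get() above could raise an SSLError. A sketch of handling
# that case by falling back to verify=False (insecure, for demonstration only):
try:
    requests.get(url)
except requests.exceptions.SSLError:
    requests.get(url, verify=False)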
# Regular expressions
import re
# 1. re.match() matches from the start of the string; if the beginning does not match, the whole match fails. Compare with re.search().
content = 'hello 123 4567 word , this is an example'
result = re.match(r'^hello\s\d{3}\s\d{4}.*example$', content)
print(result)
print(result.group())  # this pattern has no capture group, so use group() / group(0)
print(result.span())
result = re.match(r'^he.*?(\d+\s\d+).*ple$', content)
print(result)
print(result.group(1))
# 2. re.search() scans the whole string and returns the first successful match
content = 'string hello 123 4567 word ,this is an example'
result1 = re.match(r'he.*(\d+\s\d+).*ple$', content)    # returns None: the string does not start with 'he'
result2 = re.search(r'he.*?(\d+\s\d+).*ple$', content)  # non-greedy
result3 = re.search(r'he.*(\d+\s\d+).*ple$', content)   # greedy
print(result1)
print(result2)
print(result2.group(1))
print(result3.group(1))
# re.sub() replaces every successful match and returns the resulting string
# here: match the digits in the string and replace them with 'replacement'
result4 = re.sub(r'\d', 'replacement', content)
print(result4)
result5 = re.sub(r'\d+', 'replacement', content)
print(result5)
# re.compile() compiles a regex string into a reusable pattern object
pattern = re.compile(r'hello.*?(\d+\s\d+).*example$')
result6 = re.search(pattern, content)
print(result6)
print(result6.group(1))
'''
Note:
resp.text returns Unicode (str) data.
resp.content returns bytes, i.e. raw binary data.
In other words: use r.text for text, and r.content for images and other files.
(resp.json() returns the body parsed as JSON.)
'''
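# A small sketch of resp.json(): httpbin.org (a public echo service, not part of the original
# notes) returns the request back as JSON, so .json() parses the body straight into a dict.
resp_json = requests.get('https://httpbin.org/get', params={'wd': 'python'})
print(resp_json.json()['args'])    # {'wd': 'python'}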
# Example: book information from the Douban Books homepage
import requests
import re
import pandas as pd
url = 'https://book.douban.com/'
response = requests.get(url)
html = response.text
print(html)
pattern = re.compile('<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?class="more-meta".*?class="author">(.*?)</span>.*?class="year">(.*?)</span>.*?class="publisher">(.*?)</span>',
re.S)
douban = re.findall(pattern, html)
df = []
for i in douban:
    a = []
    for j in i:
        print(j.strip())
        a.append(j.strip())
    df.append(a)
df = pd.DataFrame(df, columns=['url', 'bookName', 'author', 'publishDate', 'bookSource'])
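# Optional follow-up (a sketch; the output path is a placeholder): save the scraped table to
# CSV so it can be inspected outside Python. utf-8-sig keeps Chinese text readable in Excel.
df.to_csv(r'C:\Users\Administrator\Desktop\python_work\douban_books.csv',
          index=False, encoding='utf-8-sig')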
# Exercise: scrape product data from a P2P lending platform (niwodai.com)
ht ='''
<div class="prolistMsg pad_t20 clearfix">
<a href="https://member.niwodai.com/inteBid/inteBidPeriodDetail.do?period_id=ADZUNlYwVTkFY1RgUDleago7VWwCaAJjBTcFMQBmU28=&plan_sign=J2&nwd=1" target="_blank">
<ul class="prolistType wid_w1120 bor_b pad_l30 pad_r30 pad_b20 clearfix">
<li class="fl wid_w280 ">
<p class="lin_40 pad_t5">
<strong class="fs_18 fc_3 mar_r5">有道智投-2月期</strong>
</p>
<span class="fc_9">债权持满60日可免费转让</span>
</li>
<li class="fl wid_w260">
<p class="mar_t5 fc_f60 fs_16 lin_40">
<em class="fs_24 Numfont">7</em>%
<em class="fs_221 mar_l5 mar_r5">-</em>
<em class="fs_24 Numfont">9</em>%
<!--全局加息start-->
<!--定义参数-->
<!--全局加息end-->
</p>
<span class="fc_9">历史参考扣费后年化利率</span>
</li>
<li class="fl wid_w260">
<p class="mar_t5 fc_3 fs_16 lin_40"><em class="fs_18 fc_f60 Numfont mar_r5">321,108</em>元</p>
<span class="fc_9">剩余金额</span>
</li>
<li class="fl wid_w80">
<div class="biaolh"><span class="b_jingdu b_jd78">78%</span></div>
</li>
<li class="fr wid_w120">
<div class="biaolh pad_t10">
<span class="btn btn_size120 btn_bgf60">授权出借</span>
</div>
</li>
</ul>
</a>
</div>
'''
# Fields to extract: product name, transfer note, lower rate bound, upper rate bound, remaining amount
nwd = re.compile('<div class="prolistMsg pad_t20 clearfix">.*?<li.*?<strong class="fs_18 fc_3 mar_r5">(.*?)</strong>.*?<span class="fc_9">(.*?)</span>.*?<em class="fs_24 Numfont">(.*?)</em>.*?<em class="fs_24 Numfont">(.*?)</em>.*?<p.*?class="fs_18 fc_f60 Numfont mar_r5">(.*?)</em>.*?',re.S)
url_nwd = 'https://member.niwodai.com/portal/inteBid/inteBidPage.do'
headers = {'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/64.0.3282.186 Mobile Safari/537.36'}
response = requests.get(url_nwd, headers = headers)
html = response.text
print(html)
nwd_product = re.findall(nwd, html)
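# A sketch mirroring the Douban example above: put the 5-tuples returned by findall into a
# DataFrame. The column names are my own labels for the captured groups, not from the site.
nwd_df = pd.DataFrame([[field.strip() for field in row] for row in nwd_product],
                      columns=['productName', 'transferNote', 'rateLow', 'rateHigh', 'remainingAmount'])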
'''
Scratch work: testing the sub-patterns against the sample HTML (ht) above
nwd_1 = re.compile('<strong class="fs_18 fc_3 mar_r5">(.*?)</strong>', re.S)
nwd_2 = re.compile('<p.*?<span class="fc_9">(.*?)</span>', re.S)
re.findall(nwd_2, ht)
re.findall(nwd_1, ht)
nwd_1 = re.findall(nwd, html)
nwd_1'''
from bs4 import BeautifulSoup as bs
htm = '''
<li class="fl wid_w260">
<p class="mar_t5 fc_f60 fs_16 lin_40">
<em class="fs_24 Numfont">7</em>%
<em class="fs_221 mar_l5 mar_r5">-</em>
<em class="fs_24 Numfont">9</em>%
<!--全局加息start-->
<!--定义参数-->
<!--全局加息end-->
'''
html = '''
<li class="fl wid_w260">
<p class="mar_t5 fc_f60 fs_16 lin_40">
<em class="fs_24 Numfont">7</em>%
<em class="fs_221 mar_l5 mar_r5">-</em>
<em class="fs_24 Numfont">9</em>%
<!--全局加息start-->
<!--定义参数-->
<!--全局加息end-->
</p>
<span class="fc_9">历史参考扣费后年化利率</span>
</li>
'''
soup = bs(htm,'lxml')
soup.prettify()  # pretty-print the parsed HTML; BeautifulSoup also repairs/auto-closes the broken tags in htm
soup.p           # select a tag by name
soup.li
soup.em          # if there are several matches, only the first is returned
soup.li.name     # name of the tag
soup.em.attrs['class']  # value of the class attribute of the first em tag
soup.em['class']        # same result as the line above
soup.em.string          # text contained inside the tag
soup.p.em               # nested selection: the em tag inside p
############ Children and descendants
soup.p.contents
for i, child in enumerate(soup.children):
    print(i, child)
soup = bs(ht, 'lxml')
# CSS selectors: p tags nested inside li tags
soup.select('li p')
soup = bs(ht, 'lxml')
soup.p
soup.select('li')
soup.select('p')
soup.select('strong')
type(soup.select('span'))
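# The same queries can also be written with find()/find_all(), which is often clearer than
# CSS selectors when filtering on a class (a small sketch against the ht snippet above):
soup.find_all('em', class_='fs_24')           # every <em> whose class list contains 'fs_24'
soup.find('span', class_='fc_9').get_text()   # text of the first <span class="fc_9">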
######################pyquery
from pyquery import PyQuery as pq
doc = pq(html)  # doc behaves like a CSS selector: '#' means id, '.' means class, a bare name means a tag
print(doc('li'))              # select by tag name
print(doc('.fs_24.Numfont'))  # elements with both class 'fs_24' and class 'Numfont' (no space = same element)
print(doc.find('p'))
# Iterating over matches: the .items() method
lis = doc('em').items()
print(type(lis))
for i in lis:  # iterate over the em tags and print each one's class attribute, text and inner HTML
    print(i)
    print(i.attr('class'))  # .attr() reads an attribute
    print(i.text())         # text content
    print(i.html())         # inner HTML
########## DOM operations: addClass, removeClass
# Modify attributes
print(doc('.fs_221.mar_l5.mar_r5'))
a = doc('.fs_221.mar_l5.mar_r5')
a.attr('name', 'link')  # set the name attribute: if it already exists its value becomes 'link', otherwise name="link" is added to the tag
print(a)
a.css('font-size', '14px')  # add a style property
print(a)
#### remove(): delete matched tags
a = doc.find('em')
print(a)
type(a)
a.find('.fs_221.mar_l5.mar_r5').remove()
print(a.text())
a = doc('em')
print(a)
print(a.attr('class'))
type(doc('li'))
doc = pq(url = 'http://www.baidu.com')
print(doc('head'))
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
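# The selenium imports above are not used in these notes; a minimal sketch of how they fit
# together (selenium 3 style, assumes a chromedriver matching the local Chrome is on PATH;
# the element ids 'kw' and 'content_left' are what Baidu's search page used at the time):
browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
    box = browser.find_element_by_id('kw')      # the search box
    box.send_keys('python', Keys.ENTER)
    # wait up to 10 seconds for the results container to appear
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
finally:
    browser.close()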