一、HTML解析:BeautifulSoup
1、import
from bs4 import BeautifulSoup
2、初始化
# Parse the HTML string HTMLText using the "html.parser" backend
# (Python's built-in parser, so no extra parser package is needed).
soup=BeautifulSoup(HTMLText,"html.parser")
3、find和find_all
# find_all returns a list-like ResultSet of bs4.element.Tag objects.
# find returns a single bs4.element.Tag, or None when nothing matches.
# Find every <div> tag.
divs=soup.find_all("div")
# Look up an element by attribute (e.g. id): either take the first item of
# the find_all result ...
x=soup.find_all(attrs={"id":"myId"})[0]
# ... or call find, which already returns the first match itself. Do not
# index it with [0]: subscripting a Tag looks up an HTML attribute by name,
# so find(...)[0] raises KeyError instead of returning the element.
x=soup.find(attrs={"id":"myId"})
4、获取属性值或者innerHTML
# Read an attribute value from an element Tag (x is the Tag located with
# find/find_all); calling .get() on the soup object would query the whole
# document rather than a specific element.
attrVal=x.get("attrName")
# Read the text content of the document with all markup stripped.
# NOTE: this is plain text, not the inner HTML; Tag.decode_contents()
# returns the inner markup as a string.
text=soup.get_text()
二、HTML转义符处理:html模块
import requests
import sys
import os
from bs4 import BeautifulSoup
import urllib
from urllib import parse,request
import html
data="""
<td class="text"><input type='hidden' name='637_content' id='637_content' value='<p>一、填空题</p><p>1、D 2、C 3、A 4、B 5、1001 6、计算1+3+…+97+99,即100以内的奇数和,执行后的结果是S=2500</p><p>二、编程题</p><p><a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDBCPDECNDDCOGGHCGN">4-3.frm</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDCCPDECNDDCOHGGCHA">4-3.vbp</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDGDHCPDEFPDCDACOGGHCGN">4_20.frm</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDGDICPDEFPDCDACOHGGCHA">4_20.vbp</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDGDJCPDEFPDCDBCOGGHCGN">4_21.frm</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDACPDEFPDCDBCOHGGCHA">4_21.vbp</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDDCPDEFPDCDCCOGGHCGN">4_22.frm</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDECPDEFPDCDCCOHGGCHA">4_22.vbp</a> </p><p>三、应用题</p><p><a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDFCPDEFPDDDBCOGGHCGN">4_31.frm</a> <a href="/meol/common/ckeditor/openfile.jsp?id=DBCPDDDGDFDDDHDGCPDEFPDDDBCOHGGCHA">4_31.vbp</a></p><p> </p>'><iframe id='_rtf_content637' allowTransparency='true' name='_rtf_content637' src='/meol/common/ckeditor/content.html?name=637' width='100%' height='100%' frameborder='0' scrolling='no' marginheight='0'></iframe> </td>
"""
# Parse the HTML fragment stored in `data`.
table=BeautifulSoup(data,"html.parser")
# Read an attribute: the hidden <input>'s "value" attribute carries an
# embedded HTML fragment as a string.
inp=table.find("input").get("value")
# To read the text content instead of an attribute, use table.get_text().
print(inp)
# Convert HTML character references (&lt;, &amp;, &#12289;, ...) back to the
# characters they stand for. html.unescape is the public API;
# html.parser.unescape was an undocumented re-export that was removed in
# Python 3.9, so calling it raises AttributeError on current interpreters.
print(html.unescape(inp))
三、HTTP POST/GET参数GBK编码:Requests接受的参数默认按utf-8编码,但是经常遇到需要gbk编码的网站,所以自己写了一个编码函数。
def _gbk_percent_encode(value):
    """Return *value* GBK-encoded with every byte written as ``%XX``.

    ``bytes``/``bytearray`` values are used as-is; anything else is
    converted with ``str()`` first so numbers can be passed directly.
    """
    if isinstance(value, (bytes, bytearray)):
        raw = bytes(value)
    else:
        raw = str(value).encode("gbk")
    # Every byte is escaped (including ASCII letters and digits); this is
    # always a valid percent-encoding and keeps the output format uniform.
    return "".join("%%%02X" % byte for byte in raw)


def urlencode_gbk(kv):
    """Build a GBK percent-encoded query string from a mapping.

    Each key and value is encoded to GBK bytes and every byte is emitted
    as ``%XX`` (uppercase hex). Pairs are written as ``key=value`` and
    joined with ``&`` in the mapping's iteration order.

    Args:
        kv: Mapping of parameter names to values. Keys and values are
            normally ``str``; other values are converted with ``str()``.

    Returns:
        The encoded query string, or ``""`` for an empty mapping.

    Raises:
        UnicodeEncodeError: If a key or value contains a character that
            cannot be represented in GBK.

    Note:
        ``urllib.parse.urlencode(kv, encoding="gbk")`` is the standard
        library alternative; it leaves unreserved characters unescaped and
        therefore yields a shorter string for the same data.
    """
    return "&".join(
        _gbk_percent_encode(key) + "=" + _gbk_percent_encode(kv[key])
        for key in kv
    )