import requests
from bs4 import BeautifulSoup
# Search csres.com (工标网) for a standard number and print the result rows.
# The site expects the search keyword percent-encoded as GBK bytes.


def build_search_url(keyword: str) -> str:
    """Return the csres.com search URL for *keyword*.

    The keyword is encoded to GBK and every byte is emitted as a
    zero-padded ``%xx`` escape (the site's expected query format).
    """
    gbk_bytes = keyword.encode("gbk")
    # BUG FIX: hex(b)[2:] drops the leading zero for bytes < 0x10,
    # which would produce a malformed escape like "%a" instead of "%0a".
    encoded = "".join(f"%{b:02x}" for b in gbk_bytes)
    return f"http://www.csres.com/s.jsp?keyword={encoded}&pageSize=1000&pageNum=1"


def fetch_standards(url: str) -> list:
    """Fetch the search-result page and return a list of rows.

    Each row is a 5-element list of cell texts:
    [number, name, issuing department, implementation date, status].
    """
    # A browser-like User-Agent is required; the site rejects bare clients.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # Result rows are exactly the <tr bgcolor="#FFFFFF"> elements.
    rows = soup.find_all("tr", attrs={"bgcolor": "#FFFFFF"})
    data_list = []
    for tr_tag in rows:
        td_tags = tr_tag.find_all("td")
        data_list.append([td_tags[i].text.strip() for i in range(5)])
    return data_list


if __name__ == "__main__":
    url = build_search_url("GB/T 9755")
    print(url)
    for item in fetch_standards(url):
        print(item)