爬取的网站:'http://quote.eastmoney.com/stocklist.html'
'https://gupiao.baidu.com/stock/'
准备工作:首先安装 requests 和 BeautifulSoup 两个库;爬取的内容是股票的交易信息。
一、所有爬虫的第一步:请求链接
import requests
import re
from bs4 import BeautifulSoup
def getHtmlText(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    :param url: URL to request
    :return: the page text on success, or '' on any request failure
             (callers test ``html == ''`` to skip a failed fetch)
    """
    try:
        # Bounded timeout so a dead host cannot hang the whole crawl.
        r = requests.get(url, timeout=10)
        # Raise for 4xx/5xx HTTP status codes.
        r.raise_for_status()
        # Force UTF-8 decoding of the payload.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Bug fix: the original returned '连接异常' here, so the caller's
        # ``if html == '':`` guard could never match. Return '' instead,
        # and catch only request-related errors rather than a bare except.
        return ''
二、对爬取到的内容进行处理
def getCaifuStockLis(StockCaifuurl, lst):
    """Collect stock ticker codes (e.g. 'sh600000') from the listing page.

    :param StockCaifuurl: URL of the Eastmoney stock-list page
    :param lst: output list; matched codes are appended in place
    :return: None
    """
    # Fetch and parse the listing page.
    html = getHtmlText(StockCaifuurl)
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once outside the loop; codes look like 'sh######'/'sz######'.
    pattern = re.compile(r'[s][hz]\d{6}')
    # Ticker codes live in the href attribute of <a> tags.
    for a_tag in soup('a'):
        try:
            href = a_tag.attrs['href']
        except KeyError:
            # Anchor without an href — nothing to parse; narrowed from
            # the original bare except so real bugs are not swallowed.
            continue
        match = pattern.search(href)
        if match:
            lst.append(match.group())
def getStockInfo(lit, stockUrl, fpath):
    """Fetch each stock's detail page and append its fields to *fpath*.

    :param lit: list of ticker codes such as 'sh600000'
    :param stockUrl: URL prefix; the page URL is stockUrl + code + '.html'
    :param fpath: path of the output text file (one dict repr per line)
    :return: None
    """
    total = len(lit)
    # Pages processed so far, driving the console progress bar.
    count = 0
    for code in lit:
        url = stockUrl + code + '.html'
        html = getHtmlText(url)
        try:
            if html == '':
                # Fetch failed; skip this stock. Bug fix: still advance
                # the counter so the progress percentage stays accurate.
                count = count + 1
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stock_div = soup.find("div", attrs={'class': 'stock-info'})
            name = stock_div.find_all(attrs={'class': 'bets-name'})[0]
            # Save the stock's display name.
            infoDict.update({"股票名": name.text.split()[0]})
            # <dt>/<dd> pairs carry field-name / field-value text.
            for key_tag, val_tag in zip(stock_div('dt'), stock_div('dd')):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding="utf-8") as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")
        except (AttributeError, IndexError, OSError):
            # Unexpected page layout or file-write failure. Bug fix: the
            # original did not advance count here, so the bar stalled.
            count = count + 1
            print("\r异常当前进度: {:.2f}%".format(count * 100 / total), end="")
            continue
def main():
    """Entry point: collect ticker codes, then scrape each stock's page."""
    lit = []
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    getCaifuStockLis(stock_list_url, lit)
    getStockInfo(lit, stock_info_url, output_file)


# Idiom fix: guard the entry point so importing this module does not
# immediately start crawling.
if __name__ == "__main__":
    main()