以前写的Python抓取股票资料的项目,因为效率太低后来没有用了,用python跑完一圈大概要几十分钟,没法使用,但是当python的demo还是不错的(后来用java写调用sino js函数的,25秒收集一次全部股票的资料并写入redis,基本够用了),这个demo用到的技能包括
数据库操作,定时任务,用BeautifulSoup爬虫,python的字典,正则等
数据库储存的情况:
项目结构
可以在这里下载代码
https://download.youkuaiyun.com/download/howard789/11222353
定时任务main方法
import datetime
import schedule
import threading
import time
from src.ScrapyStockTradeInfo import collectDailyTradeInfo
def job1():
print("job1 collectDailyTradeInfo start:", datetime.datetime.now())
collectDailyTradeInfo();
print("job1 collectDailyTradeInfo end:", datetime.datetime.now())
def job1_task():
threading.Thread(target=job1).start()
def run():
# 立即执行
if (1==1):
job1()
# 定时执行
if (1==2):
schedule.every(10).seconds.do(job1)
# 定时执行
if (1==2):
schedule.every().day.at("22:40").do(job1)
while True:
schedule.run_pending()
time.sleep(1)
run()
主方法
#!user/bin/env python3
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import re
import traceback
from src import Method, dbUtil
# url,html,soap,div
from src.Method import getDicModel, saveStockInfo
from src.dbUtil import Insert
BB=","
BB2="\',\'"
def _getFileName(market='sh'):
name = time.strftime('%Y%m%d', time.localtime(time.time()))
return name + market
def _getStockListFromDatabase(stock_list):
sql='select stock_id_full from stock_list';
result=dbUtil.Select(sql)
for i in result:
stock_list.append(i[0]);
return stock_list;
def _getStockList(stock_list, stock_list_url):
html = getHtml(stock_list_url)
soup = BeautifulSoup(html, 'html.parser')
a = soup.find_all('a')
for i in a:
try:
href = i.attrs['href']
tmp = re.findall(r'sh6\d{5}', href)
if (len(tmp) > 0):
stock_list.append(tmp[0])
tmp2 = re.findall(r'sz\d{6}', href)
if (len(tmp2) > 0):
stock_list.append(tmp2[0])
except:
traceback.print_exc()
def _getStockInfo(stock_list, stock_info_url, outPutFilePath,tableName):
# print(now)
for stockNo in stock_list:
try:
url = stock_info_url + stockNo + '.html'
print(url)
html = getHtml(url)
infoDic = {}
if html == '':
continue
soap = BeautifulSoup(html, 'html.parser')
stock_info = soap.find('div', attrs={'class': 'stock-info'})
if (stock_info == None):
continue
keyList = stock_info.find_all('dt')
valList = stock_info.find_all('dd')
for i in range(len(keyList)):
key = keyList[i].text
value = valList[i].text.strip()
infoDic[key] = value
saveStockInfo(infoDic,stockNo,tableName)
print('写入'+stockNo)
except:
traceback.print_exc()
print('取得'+stockNo+'失败')
def getHtml(url):
try:
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
headers = {'User-Agent': user_agent}
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
traceback.print_exc()
return '';
def collectDailyTradeInfo():
print("start collectDailyTradeInfo")
stock_info_url = 'https://gupiao.baidu.com/stock/'
print('p1')
outPutFilePath = './' + _getFileName() + '.txt'
stock_list = []
# _getStockList(stock_list, stock_list_url)
# 从数据库取得股票列表
_getStockListFromDatabase(stock_list)
# 打印列表
print(stock_list)
# 创建用于存放数据的新表
tableName=Method.createTable();
# 下载解析数据,并将数据写入表中
_getStockInfo(stock_list, stock_info_url, outPutFilePath,tableName)
if __name__ == "__main__":
collectDailyTradeInfo()
数据库方法
import pymysql
def getDB():
db = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="test", charset="utf8mb4")
return db
# 关闭数据库连接
def _Close(db, cursor):
cursor.close()
db.close()
def _executeSql(sql):
db = getDB()
# 使用cursor()方法获取操作游标
cursor = db.cursor()
# SQL 删除语句
# sql = "DELETE FROM EMPLOYEE WHERE AGE > '%d'" % (20)
try:
# 执行SQL语句
cursor.execute(sql)
# 提交修改
db.commit()
except:
# 发生错误时回滚
db.rollback()
# 关闭连接
_Close(db,cursor)
def Select(sql):
db=getDB()
cursor = db.cursor()
cursor.execute(sql)
# 获取所有记录列表
results = cursor.fetchall()
_Close(db,cursor)
return results
def Count(sql):
db=getDB()
cursor = db.cursor()
# sql = "SELECT * FROM main"
cursor.execute(sql)
cursor.close()
db.close()
# 获取所有记录列表
results = cursor.fetchone()
return results
def createTable(tableName):
sql = "DROP TABLE IF EXISTS "+tableName;
_executeSql(sql)
sql=getTemplateCreateTableSql();
sql=sql.replace("scrapy_example",tableName)
_executeSql(sql)
# 增
def Insert(sql):
db=getDB()
cursor=db.cursor()
# 执行SQL语句
cursor.execute(sql)
# 没有设置默认自动提交,需要主动提交,以保存所执行的语句
db.commit()
_Close(db,cursor)
# 删
def Delect(db, cursor):
sql = "DELETE FROM main WHERE Name = '修改后的名字'"
cursor.execute(sql)
db.commit()
# 查
def Select(sql):
db=getDB()
cursor=db.cursor()
cursor.execute(sql)
# 获取所有记录列表
results = cursor.fetchall()
return results
# def insertStockInfo(sql):
def getTemplateCreateTableSql():
sql = '''
CREATE TABLE `scrapy_example` (
`stockId` varchar(32) NOT NULL,
`open` varchar(32) DEFAULT NULL COMMENT '今开',
`volumn` varchar(32) DEFAULT NULL COMMENT '成交量',
`high` varchar(32) DEFAULT NULL COMMENT '最高',
`limitUp` varchar(32) DEFAULT NULL,
`insidePlate` varchar(32) DEFAULT NULL COMMENT '内盘',
`turnover` varchar(32) DEFAULT NULL COMMENT '成交额',
`committee` varchar(32) DEFAULT NULL COMMENT '委比',
`liquidity` varchar(32) DEFAULT NULL COMMENT '流通市值',
`peMRQ` varchar(32) DEFAULT NULL COMMENT '市盈率MRQ',
`eps` varchar(32) DEFAULT NULL COMMENT '每股收益',
`equity` varchar(32) DEFAULT NULL COMMENT '总股本',
`previousClose` varchar(32) DEFAULT NULL COMMENT '昨收',
`turnoverRate` varchar(32) DEFAULT NULL COMMENT '换手率',
`low` varchar(32) NOT NULL COMMENT '最低',
`limitDown` varchar(32) DEFAULT NULL COMMENT '跌停',
`outsidePlate` varchar(32) DEFAULT NULL COMMENT '外盘',
`dayRange` varchar(32) DEFAULT NULL COMMENT '振幅',
`qrr` varchar(32) DEFAULT NULL COMMENT '量比quantity relative ratio',
`marketCap` varchar(32) DEFAULT NULL COMMENT '总市值',
`pb` varchar(32) DEFAULT NULL COMMENT '市净率',
`bps` varchar(32) DEFAULT NULL COMMENT '每股净资产',
`tradableShare` varchar(32) DEFAULT NULL COMMENT '流通股本',
`date` datetime NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;'''
return sql
def getTemplateCreateTableSql_backUp():
sql = '''
CREATE TABLE `scrapy_example` (
`stockId` varchar(10) NOT NULL,
`open` decimal(10,2) DEFAULT NULL COMMENT '今开',
`volumn` decimal(20,2) DEFAULT NULL COMMENT '成交量',
`high` decimal(10,2) DEFAULT NULL COMMENT '最高',
`limitUp` decimal(10,2) DEFAULT NULL,
`insidePlate` decimal(10,2) DEFAULT NULL COMMENT '内盘',
`turnover` decimal(20,2) DEFAULT NULL COMMENT '成交额',
`committee` decimal(10,2) DEFAULT NULL COMMENT '委比',
`liquidity` decimal(20,2) DEFAULT NULL COMMENT '流通市值',
`peMRQ` decimal(10,2) DEFAULT NULL COMMENT '市盈率MRQ',
`eps` decimal(10,2) DEFAULT NULL COMMENT '每股收益',
`equity` decimal(20,2) DEFAULT NULL COMMENT '总股本',
`previousClose` decimal(10,2) DEFAULT NULL COMMENT '昨收',
`turnoverRate` decimal(10,2) DEFAULT NULL COMMENT '换手率',
`low` decimal(10,2) NOT NULL COMMENT '最低',
`limitDown` decimal(10,2) DEFAULT NULL COMMENT '跌停',
`outsidePlate` decimal(10,2) DEFAULT NULL COMMENT '外盘',
`dayRange` decimal(10,2) DEFAULT NULL COMMENT '振幅',
`qrr` decimal(10,2) DEFAULT NULL COMMENT '量比quantity relative ratio',
`marketCap` decimal(20,2) DEFAULT NULL COMMENT '总市值',
`pb` decimal(10,2) DEFAULT NULL COMMENT '市净率',
`bps` double(10,2) DEFAULT NULL COMMENT '每股净资产',
`tradableShare` double(20,0) DEFAULT NULL COMMENT '流通股本',
`date` datetime NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;'''
return sql
字符串操作
import time
from src import dbUtil
from src.dbUtil import Insert
def _parseWanShou(str):
if(str.strip()=='--'):
return 0
# str=str.replace('万手','').strip()
if('万手' in str):
str=str.replace('万手','').strip()
if('手' in str):
str=str.replace('手','').strip()
return float(str)
def _parseYi(str):
if(str.strip()=='--'):
return 0
if('亿' in str):
str=str.replace('亿','').strip()
if('万' in str):
str=str.replace('万','').strip()
return float(str)
def _parsePercent(str):
if(str.strip()=='--'):
return 0
str=str.replace('%','')
return float(str)
def _parseNum(str):
if(str.strip()=='--'):
return 0
return float(str)
def getDicModel(dic,stockNo):
rtnDic={}
rtnDic['stockId']=stockNo
rtnDic['open']=_parseNum(dic['今开'])
rtnDic['volumn']=_parseWanShou(dic['成交量'])
rtnDic['high']=_parseNum(dic['最高'])
rtnDic['limitUp']=_parseNum(dic['涨停'])
rtnDic['insidePlate']=_parseWanShou(dic['内盘'])
rtnDic['turnover']=_parseYi(dic['成交额'])
rtnDic['committee']=_parsePercent(dic['委比'])
rtnDic['liquidity']=_parseYi(dic['流通市值'])
rtnDic['peMRQ']=_parseNum(dic['市盈率MRQ'])
rtnDic['eps']=_parseNum(dic['每股收益'])
rtnDic['equity']=_parseYi(dic['总股本'])
rtnDic['previousClose']=_parseNum(dic['昨收'])
rtnDic['turnoverRate']=_parsePercent(dic['换手率'])
rtnDic['low']=_parseNum(dic['最低'])
rtnDic['limitDown']=_parseNum(dic['跌停'])
rtnDic['outsidePlate']=_parseWanShou(dic['外盘'])
rtnDic['dayRange']=_parsePercent(dic['振幅'])
rtnDic['qrr']=_parseNum(dic['量比'])
rtnDic['marketCap']=_parseYi(dic['总市值'])
rtnDic['pb']=_parseNum(dic['市净率'])
rtnDic['bps']=_parseNum(dic['每股净资产'])
rtnDic['tradableShare']=_parseYi(dic['流通股本'])
rtnDic['datetime']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
return rtnDic
def _getSql(infoDic,tableName):
BB=","
sql="INSERT INTO "+ tableName+" VALUES (\'"+infoDic['stockId']+"\',"+str(infoDic['open'])+BB+str(infoDic['volumn'])+BB+ str(infoDic['high'])+BB+ str(infoDic['limitUp'])+BB+ str(infoDic['insidePlate'])+BB+ str(infoDic['turnover'])+BB+ str(infoDic['committee'])+BB+str(infoDic['liquidity'])+BB+ str(infoDic['peMRQ'])+BB+ str(infoDic['eps'])+BB+ str(infoDic['equity'])+BB+str(infoDic['previousClose'])+BB+ str(infoDic['turnoverRate'])+BB+ str(infoDic['low'])+BB+ str(infoDic['limitUp'])+BB+ str(infoDic['outsidePlate'])+BB+ str(infoDic['dayRange'])+BB+ str(infoDic['qrr'])+BB+ str(infoDic['marketCap'])+BB+ str(infoDic['pb'])+BB+ str(infoDic['bps'])+BB+ str(infoDic['tradableShare'])+",\'"+infoDic['datetime']+"\')";
return sql
# {'stockId': 'sz300059', 'open': 12.7, 'volumn': 164.58, 'high': 13.28,
# 'limitUp': 13.9, 'insidePlate': 77.37, 'turnover': 21.35, 'committee': 67.91,
# 'liquidity': 535.38, 'peMRQ': 55.32, 'eps': 0.07, 'equity': 51.68,
# 'previousClose': 12.64, 'turnoverRate': 3.96, 'low': 12.61,
# 'limitDown': 11.38, 'outsidePlate': 87.21, 'dayRange': 5.3,
# 'qrr': 3.82, 'marketCap': 666.18, 'pb': 4.02, 'bps': 3.21, 'tradableShare': 41.53,
# 'datetime': '2018-07-07 00:31:49'}
def saveStockInfo(dic,stockNo,tableName):
infoDic=getDicModel(dic,stockNo)
sql=_getSql(infoDic,tableName)
Insert(sql)
def createTable():
tableName=time.strftime('%Y%m%d', time.localtime(time.time()))
tableName='z_daily_trade_info_'+tableName
dbUtil.createTable(tableName);
return tableName
解析数据
import time
from src import dbUtil
from src.dbUtil import Insert
def _parseWanShou(str):
if(str.strip()=='--'):
return 0
# str=str.replace('万手','').strip()
if('万手' in str):
str=str.replace('万手','').strip()
if('手' in str):
str=str.replace('手','').strip()
return float(str)
def _parseYi(str):
if(str.strip()=='--'):
return 0
if('亿' in str):
str=str.replace('亿','').strip()
if('万' in str):
str=str.replace('万','').strip()
return float(str)
def _parsePercent(str):
if(str.strip()=='--'):
return 0
str=str.replace('%','')
return float(str)
def _parseNum(str):
if(str.strip()=='--'):
return 0
return float(str)
def getDicModel(dic,stockNo):
rtnDic={}
rtnDic['stockId']=stockNo
rtnDic['open']=_parseNum(dic['今开'])
rtnDic['volumn']=_parseWanShou(dic['成交量'])
rtnDic['high']=_parseNum(dic['最高'])
rtnDic['limitUp']=_parseNum(dic['涨停'])
rtnDic['insidePlate']=_parseWanShou(dic['内盘'])
rtnDic['turnover']=_parseYi(dic['成交额'])
rtnDic['committee']=_parsePercent(dic['委比'])
rtnDic['liquidity']=_parseYi(dic['流通市值'])
rtnDic['peMRQ']=_parseNum(dic['市盈率MRQ'])
rtnDic['eps']=_parseNum(dic['每股收益'])
rtnDic['equity']=_parseYi(dic['总股本'])
rtnDic['previousClose']=_parseNum(dic['昨收'])
rtnDic['turnoverRate']=_parsePercent(dic['换手率'])
rtnDic['low']=_parseNum(dic['最低'])
rtnDic['limitDown']=_parseNum(dic['跌停'])
rtnDic['outsidePlate']=_parseWanShou(dic['外盘'])
rtnDic['dayRange']=_parsePercent(dic['振幅'])
rtnDic['qrr']=_parseNum(dic['量比'])
rtnDic['marketCap']=_parseYi(dic['总市值'])
rtnDic['pb']=_parseNum(dic['市净率'])
rtnDic['bps']=_parseNum(dic['每股净资产'])
rtnDic['tradableShare']=_parseYi(dic['流通股本'])
rtnDic['datetime']=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
return rtnDic
def _getSql(infoDic,tableName):
BB=","
sql="INSERT INTO "+ tableName+" VALUES (\'"+infoDic['stockId']+"\',"+str(infoDic['open'])+BB+str(infoDic['volumn'])+BB+ str(infoDic['high'])+BB+ str(infoDic['limitUp'])+BB+ str(infoDic['insidePlate'])+BB+ str(infoDic['turnover'])+BB+ str(infoDic['committee'])+BB+str(infoDic['liquidity'])+BB+ str(infoDic['peMRQ'])+BB+ str(infoDic['eps'])+BB+ str(infoDic['equity'])+BB+str(infoDic['previousClose'])+BB+ str(infoDic['turnoverRate'])+BB+ str(infoDic['low'])+BB+ str(infoDic['limitUp'])+BB+ str(infoDic['outsidePlate'])+BB+ str(infoDic['dayRange'])+BB+ str(infoDic['qrr'])+BB+ str(infoDic['marketCap'])+BB+ str(infoDic['pb'])+BB+ str(infoDic['bps'])+BB+ str(infoDic['tradableShare'])+",\'"+infoDic['datetime']+"\')";
return sql
# {'stockId': 'sz300059', 'open': 12.7, 'volumn': 164.58, 'high': 13.28,
# 'limitUp': 13.9, 'insidePlate': 77.37, 'turnover': 21.35, 'committee': 67.91,
# 'liquidity': 535.38, 'peMRQ': 55.32, 'eps': 0.07, 'equity': 51.68,
# 'previousClose': 12.64, 'turnoverRate': 3.96, 'low': 12.61,
# 'limitDown': 11.38, 'outsidePlate': 87.21, 'dayRange': 5.3,
# 'qrr': 3.82, 'marketCap': 666.18, 'pb': 4.02, 'bps': 3.21, 'tradableShare': 41.53,
# 'datetime': '2018-07-07 00:31:49'}
def saveStockInfo(dic,stockNo,tableName):
infoDic=getDicModel(dic,stockNo)
sql=_getSql(infoDic,tableName)
Insert(sql)
# '成交额': '21.35亿',turnover
# '委比': '67.91%',committee
# '流通市值': '535.38亿',liquidity
# '市盈率MRQ': '55.32',peMRQ
# '每股收益': '0.07',eps
# '总股本': '51.68亿',equity
# '昨收': '12.64', previousClose
# '换手率': '3.96%', turnoverRate
# '最低': '12.61', low
# '跌停': '11.38', limitDown
# '外盘': '87.21万手', outsidePlate
# '振幅': '5.30%', dayRange
# '量比': '3.82', qrr
# '总市值': '666.18亿', marketCap
# '市净率': '4.02', pb
# '每股净资产': '3.21', equityPS
# '流通股本': '41.53亿'tradableShare}
def createTable():
tableName=time.strftime('%Y%m%d', time.localtime(time.time()))
tableName='z_daily_trade_info_'+tableName
dbUtil.createTable(tableName);
return tableName
Stock List的 sql Demo
SET FOREIGN_KEY_CHECKS=0;
DROP TABLE IF EXISTS `stock_list`;
CREATE TABLE `stock_list` (
`stock_id_full` varchar(32) DEFAULT NULL,
`stock_id_number` varchar(32) DEFAULT NULL,
`market` varchar(32) DEFAULT NULL,
`stock_name` varchar(32) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO `stock_list` VALUES ('sh600000', '600000', 'sh', '浦发银行');
INSERT INTO `stock_list` VALUES ('sh600001', '600001', 'sh', '邯郸钢铁');
INSERT INTO `stock_list` VALUES ('sh600002', '600002', 'sh', '齐鲁石化');
INSERT INTO `stock_list` VALUES ('sh600003', '600003', 'sh', 'ST东北高');
INSERT INTO `stock_list` VALUES ('sh600004', '600004', 'sh', '白云机场');
INSERT INTO `stock_list` VALUES ('sh600005', '600005', 'sh', '武钢股份');
INSERT INTO `stock_list` VALUES ('sh600006', '600006', 'sh', '东风汽车');