利用的是XlsbWriter包。
from pyxlsbwriter import XlsbWriter
import datetime
from decimal import Decimal
import random
from datetime import datetime, timedelta
def generate_stock_data(date, num_rows=10000):
"""
生成指定日期的模拟股市行情数据
Args:
date: 日期 (datetime对象)
num_rows: 数据行数
Returns:
list: 包含模拟股市数据的二维列表
"""
# 生成股票代码列表
stock_codes = [f"STK{str(i).zfill(5)}" for i in range(1, 1001)]
# 表头
headers = ["股票代码", "时间", "开盘价", "最高价", "最低价", "收盘价",
"前收盘价", "涨跌额", "涨跌幅(%)", "成交量(股)", "成交额(元)", "换手率(%)"]
data = [headers]
for i in range(num_rows):
stock_code = random.choice(stock_codes)
# 生成时间 (从9:30到15:00)
hour = random.randint(9, 14)
minute = random.randint(0, 59)
if hour == 9:
minute = random.randint(30, 59)
elif hour == 14:
minute = random.randint(0, 57)
timestamp = datetime(date.year, date.month, date.day, hour, minute,
random.randint(0, 59))
# 生成价格数据
base_price = random.uniform(5, 500)
open_price = Decimal(str(round(base_price, 2)))
high_price = Decimal(str(round(float(open_price) * random.uniform(1.0, 1.1), 2)))
low_price = Decimal(str(round(float(open_price) * random.uniform(0.9, 1.0), 2)))
close_price = Decimal(str(round(random.uniform(float(low_price), float(high_price)), 2)))
# 计算涨跌幅
prev_close = Decimal(str(round(float(open_price) * random.uniform(0.95, 1.05), 2)))
change = Decimal(str(round(float(close_price) - float(prev_close), 2)))
change_percent = Decimal(str(round((float(change) / float(prev_close)) * 100, 2)))
# 生成成交量数据
volume = random.randint(10000, 10000000)
amount = Decimal(str(round(volume * float(close_price), 2)))
# 换手率
turnover_rate = Decimal(str(round(random.uniform(0.1, 15.0), 2)))
row = [
stock_code,
timestamp,
open_price,
high_price,
low_price,
close_price,
prev_close,
change,
change_percent,
volume,
amount,
turnover_rate
]
data.append(row)
# 按时间排序 (跳过表头)
data[1:] = sorted(data[1:], key=lambda x: x[1])
return data
def create_daily_stock_xlsb(filename, start_date, num_days=30, rows_per_sheet=10000, compression_level=6):
"""
创建包含多个sheet的xlsb文件,每个sheet代表一天的股市数据
Args:
filename: 输出文件名
start_date: 开始日期 (YYYY-MM-DD格式字符串或datetime对象)
num_days: 生成的天数
rows_per_sheet: 每个sheet的数据行数
compression_level: 压缩级别 (0-9, 0=无压缩, 9=最大压缩)
"""
if isinstance(start_date, str):
start_date = datetime.strptime(start_date, '%Y-%m-%d')
# 初始化XlsbWriter
with XlsbWriter(filename, compressionLevel=compression_level) as writer:
for day in range(num_days):
current_date = start_date + timedelta(days=day)
sheet_name = current_date.strftime('%Y-%m-%d')
print(f"正在生成 {sheet_name} 的数据...")
# 生成当天的数据
data = generate_stock_data(current_date, rows_per_sheet)
# 添加sheet并写入数据
writer.add_sheet(sheet_name)
writer.write_sheet(data)
# 显示进度
if (day + 1) % 5 == 0 or (day + 1) == num_days:
print(f"已完成 {day + 1}/{num_days} 天")
print(f"\n文件 '{filename}' 生成完成!")
print(f"共生成 {num_days} 天的数据,每个sheet包含 {rows_per_sheet} 行记录")
def main():
"""主函数"""
# 配置参数
output_filename = "daily_stock_data180d.xlsb"
start_date = "2024-01-01"
num_days = 180
rows_per_sheet = 10000
compression_level = 6 # 中等压缩级别
print("开始生成模拟股市行情数据 (XLSB格式)...")
print(f"文件名: {output_filename}")
print(f"开始日期: {start_date}")
print(f"生成天数: {num_days}")
print(f"每页行数: {rows_per_sheet}")
print(f"压缩级别: {compression_level}")
print("-" * 50)
try:
create_daily_stock_xlsb(
filename=output_filename,
start_date=start_date,
num_days=num_days,
rows_per_sheet=rows_per_sheet,
compression_level=compression_level
)
except Exception as e:
print(f"生成文件时出错: {e}")
if __name__ == "__main__":
main()
测试
root@6ae32a5ffcde:/par# ./duckdb141 -unsigned -cmd "load '/par/10/rusty_sheet.duckdb_extension';"
DuckDB v1.4.1 (Andium) b390a7c376
Enter ".help" for usage hints.
D .timer on
D copy (from read_sheets('daily_stock_data.xlsx')) to 'daily_stock_data.csv';
Run Time (s): real 5.373 user 3.434030 sys 0.648087
D copy (from read_sheets('daily_stock_data.xlsb')) to 'daily_stock_datab.csv';
Run Time (s): real 2.938 user 1.433324 sys 0.409136
D copy (from read_sheets('daily_stock_data180d.xlsb')) to 'daily_stock_datab180d.csv';
100% ▕██████████████████████████████████████▏ (00:00:02.96 elapsed)
Run Time (s): real 8.582 user 4.260902 sys 1.254622
D create table t as from read_sheets('daily_stock_data180d.xlsb');
Run Time (s): real 7.068 user 2.954096 sys 1.438975
D from t order by random() limit 3;
┌──────────┬─────────────────────┬────────┬────────┬────────┬───┬───────────┬────────────┬───────────────┬───────────┐
│ 股票代码 │ 时间 │ 开盘价 │ 最高价 │ 最低价 │ … │ 涨跌幅(%) │ 成交量(股) │ 成交额(元) │ 换手率(%) │
│ varchar │ timestamp │ double │ double │ double │ │ double │ int64 │ double │ double │
├──────────┼─────────────────────┼────────┼────────┼────────┼───┼───────────┼────────────┼───────────────┼───────────┤
│ STK00338 │ 2024-01-07 12:22:19 │ 282.57 │ 305.73 │ 257.53 │ … │ 0.57 │ 9435672 │ 2705018448.96 │ 0.86 │
│ STK00152 │ 2024-03-26 09:59:47 │ 233.64 │ 247.02 │ 232.8 │ … │ 0.46 │ 1870840 │ 457495213.6 │ 4.57 │
│ STK00334 │ 2024-01-03 13:31:45 │ 355.59 │ 374.35 │ 346.6 │ … │ -2.59 │ 5161124 │ 1792200309.0 │ 3.76 │
├──────────┴─────────────────────┴────────┴────────┴────────┴───┴───────────┴────────────┴───────────────┴───────────┤
│ 3 rows 12 columns (9 shown) │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
可见,xlsb格式读取基本上是xlsx的60%, 读取时间基本上是线性的。


被折叠的 条评论
为什么被折叠?



