import pandas as pd
from decimal import Decimal, getcontext, setcontext, InvalidOperation
from decimal import Context as DecimalContext
import numpy as np
from openpyxl import load_workbook
from scipy import stats
import math
def read_excel_to_dict(file_path, sheet_name, cell_range, column_names):
    """
    Read a rectangular region of an Excel sheet into named object arrays,
    keeping empty cells as NaN and converting numbers to Decimal so the
    sheet's precision is preserved.

    Parameters:
        file_path: path to the Excel workbook
        sheet_name: worksheet name
        cell_range: cell range such as 'C3:D27' (multi-letter columns
            like 'AA3:AB27' are supported)
        column_names: list of names, one per column in the range

    Returns:
        dict mapping each column name to a numpy object array whose
        entries are Decimal, NaN (for empty/placeholder cells), or the
        raw value when it cannot be parsed as a number.

    Raises:
        ValueError: if len(column_names) does not match the column count.
    """
    def _split_cell(cell):
        # Split 'AB12' into ('AB', 12).  The original code assumed a
        # single-letter column; this handles any column width.
        letters = ''.join(c for c in cell if c.isalpha()).upper()
        digits = ''.join(c for c in cell if c.isdigit())
        return letters, int(digits)

    def _col_index(letters):
        # Bijective base-26: 'A' -> 0, 'Z' -> 25, 'AA' -> 26, ...
        idx = 0
        for ch in letters:
            idx = idx * 26 + (ord(ch) - ord('A') + 1)
        return idx - 1

    start_cell, end_cell = cell_range.split(':')
    start_col, start_row = _split_cell(start_cell)
    end_col, end_row = _split_cell(end_cell)
    num_columns = _col_index(end_col) - _col_index(start_col) + 1

    # Validate that the caller supplied one name per column.
    if len(column_names) != num_columns:
        raise ValueError(f"列名数量({len(column_names)})与列数({num_columns})不匹配")

    # Read everything as strings so Excel's displayed precision is not
    # silently converted through float.
    df = pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        header=None,
        usecols=f"{start_col}:{end_col}",
        skiprows=start_row - 1,
        nrows=end_row - start_row + 1,
        dtype=str
    )

    # Textual placeholders Excel commonly uses for missing values.
    missing_tokens = ('nan', 'n/a', 'na', 'null', '#n/a', '', ' ')
    result = {}
    for i, col_name in enumerate(column_names):
        mixed_list = []
        for val in df.iloc[:, i]:
            # Empty cells (NaN or blank string) stay NaN.
            if pd.isna(val) or str(val).strip() == '':
                mixed_list.append(np.nan)
            # Common "missing" placeholders also become NaN.
            elif str(val).lower() in missing_tokens:
                mixed_list.append(np.nan)
            else:
                try:
                    mixed_list.append(Decimal(str(val)))
                except (InvalidOperation, ValueError, TypeError):
                    # Was a bare `except:` — narrow it so real bugs
                    # (e.g. KeyboardInterrupt) are not swallowed.
                    # Keep the raw value so callers can inspect bad cells.
                    mixed_list.append(val)
        result[col_name] = np.array(mixed_list, dtype=object)
    return result
def z_score_normalize(data_dict, fill_nan=None):
    """
    Z-score standardize each column with Decimal arithmetic, preserving
    NaN positions.

    Parameters:
        data_dict: {column name: np.array (object dtype) of Decimal / NaN}
        fill_nan: unused; kept only for backward compatibility with
            existing callers.

    Returns:
        dict with the same keys; each value is an object array where
        valid entries are Decimal z-scores and original NaN slots remain
        NaN.  Uses the population standard deviation (divide by N).
        A column whose std is zero (or that has no valid data) maps every
        valid entry to Decimal(0).
    """
    # Run under a 28-digit precision context.  The original version
    # called setcontext() without try/finally, so an exception anywhere
    # below leaked the high-precision context to the caller.
    original_context = getcontext()
    high_precision_context = original_context.copy()
    high_precision_context.prec = 28
    setcontext(high_precision_context)
    try:
        normalized_data = {}
        for col_name, arr in data_dict.items():
            # Step 1: collect valid Decimal values and their positions.
            decimal_values = []
            indices = []
            for i, val in enumerate(arr):
                if pd.isna(val):
                    continue  # NaN slots stay NaN in the output
                if isinstance(val, Decimal):
                    decimal_values.append(val)
                    indices.append(i)
                elif isinstance(val, (int, float)):
                    try:
                        # Route through str() so float repr noise is not
                        # amplified by the binary-to-decimal conversion.
                        decimal_values.append(Decimal(str(val)))
                        indices.append(i)
                    except (InvalidOperation, ValueError):
                        pass  # unconvertible value: treated as NaN

            # Step 2: mean and population std in Decimal arithmetic.
            if decimal_values:
                n = Decimal(len(decimal_values))
                mean_val = sum(decimal_values, Decimal(0)) / n
                variance = Decimal(0)
                for val in decimal_values:
                    diff = val - mean_val
                    variance += diff * diff
                variance /= n
                std_dev = variance.sqrt() if variance > 0 else Decimal(0)
            else:
                # No valid data: defaults keep the arithmetic well-defined.
                mean_val = Decimal(0)
                std_dev = Decimal(1)

            # Step 3: output array starts all-NaN.
            normalized_arr = np.empty(len(arr), dtype=object)
            normalized_arr[:] = np.nan

            # Step 4: z-score each valid value in place.
            for idx, val in zip(indices, decimal_values):
                if std_dev == 0:
                    # Constant column: define the z-score as 0.
                    normalized_arr[idx] = Decimal(0)
                else:
                    normalized_arr[idx] = (val - mean_val) / std_dev
            normalized_data[col_name] = normalized_arr
        return normalized_data
    finally:
        # Always restore the caller's decimal context.
        setcontext(original_context)
def calculate_weighted_averages(normalized_data, group_config):
    """
    Compute a per-row weighted average for each configured group of
    columns, re-normalizing weights over the valid (non-NaN) values of
    each row.

    Parameters:
        normalized_data: {column name: object array of Decimal / NaN}
        group_config: {group name: {'columns': [names], 'weights': [floats]}}

    Returns:
        {group name: list of Decimal results (np.nan for rows where every
        configured column is invalid)}

    Raises:
        ValueError: columns/weights length mismatch within a group.
        KeyError: a configured column is missing from normalized_data.
    """
    # All columns are assumed to have the same length; use the first.
    first_col = next(iter(normalized_data.values()))
    num_rows = len(first_col)
    results = {}
    for group_name, config in group_config.items():
        columns = config['columns']
        weights = config['weights']
        if len(columns) != len(weights):
            raise ValueError(f"组 '{group_name}' 的列数与权重数不匹配")
        # Auto-normalize weights that do not sum to 1.
        if not np.isclose(sum(weights), 1.0, atol=1e-6):
            print(f"警告: 组 '{group_name}' 的权重和({sum(weights)})不为1,将自动归一化")
            weights = [w / sum(weights) for w in weights]
        # BUG FIX: the original kept weights as floats, and Decimal does
        # not support arithmetic with float — `val * weight` raised
        # TypeError and aborted the whole run with no results.  Convert
        # once, via str() to avoid binary float noise.
        weights = [Decimal(str(w)) for w in weights]

        weighted_avgs = []
        nan_count = 0
        for row_idx in range(num_rows):
            valid_values = []
            valid_weights = []
            # Collect this row's valid values and their weights.
            for col_idx, col_name in enumerate(columns):
                if col_name not in normalized_data:
                    raise KeyError(f"列名 '{col_name}' 在输入数据中不存在")
                value = normalized_data[col_name][row_idx]
                if pd.isna(value) or value is None:
                    continue
                try:
                    if not isinstance(value, Decimal):
                        value = Decimal(str(value))
                    valid_values.append(value)
                    valid_weights.append(weights[col_idx])
                except (TypeError, InvalidOperation):
                    pass  # unconvertible value: treated as missing
            # Row with no valid data at all -> NaN result.
            if not valid_values:
                weighted_avgs.append(np.nan)
                nan_count += 1
                continue
            # Re-normalize the surviving weights so they sum to 1.
            total_valid_weight = sum(valid_weights, Decimal(0))
            if total_valid_weight != Decimal(1):
                adjusted_weights = [w / total_valid_weight for w in valid_weights]
            else:
                adjusted_weights = valid_weights
            # Exact Decimal accumulation of the weighted sum.
            weighted_sum = Decimal(0)
            for val, weight in zip(valid_values, adjusted_weights):
                weighted_sum += val * weight
            weighted_avgs.append(weighted_sum)
        results[group_name] = weighted_avgs
        print(f"组 '{group_name}' 处理完成: "
              f"总行数={num_rows}, "
              f"有效行={num_rows - nan_count}, "
              f"无效行={nan_count}")
    return results
# Usage example
if __name__ == "__main__":
    FILE_PATH = "shuzhi.xlsx"
    SHEET_NAME = "Sheet1"
    CELL_RANGE = "C3:I27"  # 7 columns, C through I
    COLUMN_NAMES = ["pettm", "petf1", "pblf", "pbfy1", "ev/ebitda", "股息率", "peg"]
    HANDLE_NON_NUMERIC = 'convert'  # NOTE(review): currently unused by the pipeline
    FILL_VALUE = np.nan             # NOTE(review): currently unused by the pipeline

    # 1. Read the Excel region into named arrays.
    print(f"从 {FILE_PATH} 读取数据...")
    data_dict = read_excel_to_dict(FILE_PATH, SHEET_NAME, CELL_RANGE, COLUMN_NAMES)

    # Verify how many empty cells each column contains.
    print("\n空值验证:")
    for col_name, arr in data_dict.items():
        nan_count = sum(1 for x in arr if pd.isna(x))
        print(f"{col_name}: 共 {nan_count} 个NaN值")

    # 2. Z-score standardization.
    print("\n进行Z-score标准化...")
    normalized_data = z_score_normalize(data_dict)

    print("\n标准化结果空值验证:")
    for col_name, arr in normalized_data.items():
        nan_count = sum(1 for x in arr if pd.isna(x))
        print(f"{col_name}: 共 {nan_count} 个NaN值")
    print(normalized_data)

    # 3. Weighted averages.
    # BUG FIX: the original config listed "revenue"/"cost"/"profit" in
    # group 'X' — names that never appear in COLUMN_NAMES — so
    # calculate_weighted_averages raised KeyError and the script produced
    # no results.  The groups now reference the columns actually read.
    group_config = {
        'X': {
            'columns': ["pettm", "petf1", "pblf", "pbfy1", "ev/ebitda"],
            'weights': [0.2, 0.2, 0.2, 0.2, 0.2]
        },
        'Y': {
            'columns': ["股息率", "peg"],
            'weights': [0.5, 0.5]
        }
    }

    print("计算加权平均值...")
    results = calculate_weighted_averages(normalized_data, group_config)
    print(results)
# Q (from the original post): "What error in here prevents any results?"
# A: two defects abort the run before anything is printed:
#   1. group 'X' referenced columns ("revenue", "cost", "profit") that are
#      not in COLUMN_NAMES, so calculate_weighted_averages raises KeyError;
#   2. Decimal values were multiplied by float weights, which raises
#      TypeError (Decimal does not mix with float arithmetic).
# (These two trailing lines were non-Python webpage residue and were
# themselves a SyntaxError; kept here as a comment.)