import ipdb
import pandas as pd
import os
import json
from concurrent.futures import ThreadPoolExecutor
import difflib
# Load the local IP database (ipip.net ipdb format)
try:
    reader = ipdb.City(r'C:\Users\DELL\PycharmProjects\PythonProject1\ipipfree.ipdb')
except Exception as e:
    raise FileNotFoundError("Local IP database 'ipipfree.ipdb' not found; download it and place it at the path above") from e
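# Note: `ipdb` here is the ipip.net database reader (PyPI package `ipip-ipdb`,
# not the IPython debugger that shares the import name); the free
# ipipfree.ipdb database is downloadable from https://www.ipip.net.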
# Cache file path
CACHE_FILE = 'ip_cache.json'
# Load the cache if it exists, otherwise start with an empty one
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'r', encoding='utf-8') as f:
        ip_cache = json.load(f)
else:
    ip_cache = {}
# Read the CSV file
csv_path = r'C:\Users\DELL\Desktop\中科院自动化所实习培训课程\数据分析实习项目\数据分析实习项目\GAD7.csv'
try:
    data_clean = pd.read_csv(csv_path)
except FileNotFoundError:
    raise FileNotFoundError(f"CSV file not found: {csv_path}")
# Drop columns that are not needed for the analysis
columns_to_drop = ['partner_openid', 'partner_id', 'survey_id', '110']
data_clean = data_clean.drop(columns=[col for col in columns_to_drop if col in data_clean.columns])
# Convert a column of millisecond timestamps to 'YYYYMM' strings.
# Invalid entries become NaT (and then NaN) instead of a bogus 1970 date.
def convert_timestamp(col):
    col = pd.to_numeric(col, errors='coerce')
    col = pd.to_datetime(col, unit='ms', errors='coerce')
    return col.dt.strftime('%Y%m')
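# Example (illustrative): 1609459200000 ms is 2021-01-01 00:00:00 UTC, so
# convert_timestamp(pd.Series([1609459200000])) yields '202101'; a non-numeric
# entry such as 'abc' comes back as NaN instead of raising.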
# Convert the timestamp column '21' if present, otherwise fall back to the
# closest-matching column name
if '21' in data_clean.columns:
    data_clean['21'] = convert_timestamp(data_clean['21'])
else:
    print("Column '21' does not exist; check the column names in the raw data")
    possible_matches = difflib.get_close_matches('21', data_clean.columns.tolist(), n=1, cutoff=0.6)
    if possible_matches:
        col_name = possible_matches[0]
        print(f"Using the closest column name '{col_name}' in place of '21'")
        data_clean[col_name] = convert_timestamp(data_clean[col_name])
    else:
        print("No column name close to '21' was found")
# Extract the number from the 'scores' column (formatted as [x]); drop rows with missing values
if 'scores' in data_clean.columns:
    # Extract the digits inside the square brackets
    extracted = data_clean['scores'].str.extract(r'\[(\d+)\]', expand=False)
    data_clean['scores'] = extracted
    # Drop rows where no score could be extracted
    data_clean = data_clean[data_clean['scores'].notna()].reset_index(drop=True)
    # Convert to integer type
    data_clean['scores'] = data_clean['scores'].astype(int)
    # Add 100 to values below 100
    data_clean['scores'] = data_clean['scores'].apply(lambda x: x + 100 if x < 100 else x)
else:
    print("Column 'scores' does not exist")
# Extract the first IP address from the (possibly comma-separated) 'from_ip' field
if 'from_ip' in data_clean.columns:
    data_clean['ip'] = data_clean['from_ip'].str.split(',').str[0].str.strip()
else:
    print("Column 'from_ip' does not exist")
# Look up geo info for an IP in the local database, with an in-memory cache
def get_geo_info(ip):
    if ip in ip_cache:
        print(f"Cache hit for IP: {ip}")
        return ip_cache[ip]
    try:
        info = reader.find(ip, "CN")
        if info and len(info) >= 3:
            result = {
                'ip': ip,
                'country': info[0],
                'province': info[1],
                'city': info[2]
            }
        else:
            result = {'ip': ip, 'country': 'Unknown', 'province': 'Unknown', 'city': 'Unknown'}
    except Exception as e:
        print(f"Error looking up {ip}: {e}")
        result = {'ip': ip, 'country': 'Unknown', 'province': 'Unknown', 'city': 'Unknown'}
    # Update the in-memory cache only; the file is written once at the end
    ip_cache[ip] = result
    return result
# Query IPs in batches with a thread pool and collect the results into a DataFrame
def batch_ip_lookup(ip_list, batch_size=500):
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(0, len(ip_list), batch_size):
            print(f"Processing batch from {i} to {min(i + batch_size, len(ip_list))}")
            batch = ip_list[i:i + batch_size]
            batch_results = list(executor.map(get_geo_info, batch))
            results.extend(batch_results)
    return pd.DataFrame(results)
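# Note: reader.find() is a local lookup, so the thread pool mostly pipelines
# Python-level overhead rather than hiding I/O latency; the worker threads'
# writes to ip_cache are plain dict assignments, which CPython performs
# atomically under the GIL.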
# Look up geo info for the unique IPs and merge it back into data_clean
if 'ip' in data_clean.columns:
    unique_ips = data_clean['ip'].dropna().unique().tolist()
    geo_df = batch_ip_lookup(unique_ips, batch_size=500)
    # Guard against an empty result, which would have no 'ip' column to merge on
    if not geo_df.empty:
        data_clean = pd.merge(data_clean, geo_df, on='ip', how='left')
else:
    print("Column 'ip' does not exist; skipping IP geo lookup")
# Multi-select columns to one-hot encode: column name -> number of options
multiselect_cols = {
'500': 5,
'510': 4,
'520': 3,
'530': 5,
'540': 4,
'560': 3,
'580': 5
}
# One-hot encode a comma-separated multi-select column into 0/1 indicator columns
def process_multiselect_column(df, column_name, num_options):
    binary_columns = pd.DataFrame(index=df.index)
    for i in range(num_options):
        new_col = f"{column_name}-{i}"
        # Match option i only as a whole comma-separated token (anchored at start/end or commas)
        binary_columns[new_col] = df[column_name].astype(str).str.contains(rf"(?:^|,){i}(?:,|$)", regex=True).astype(int)
    return binary_columns
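# Example (illustrative): for a cell '0,2' in column '520' with 3 options,
# process_multiselect_column(...) yields 520-0 = 1, 520-1 = 0, 520-2 = 1.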
# Keep only rows whose user_id contains at least one letter
if 'user_id' in data_clean.columns:
    data_clean = data_clean[data_clean['user_id'].str.contains(r'[A-Za-z]', na=False)]
else:
    print("Column 'user_id' does not exist")
# Collect one-hot columns for every configured multi-select column
binary_columns_all = pd.DataFrame(index=data_clean.index)
for col, num in multiselect_cols.items():
    if col in data_clean.columns:
        temp_df = process_multiselect_column(data_clean, col, num)
        binary_columns_all = binary_columns_all.join(temp_df, how='left')
    else:
        print(f"Column '{col}' does not exist; skipping")
# Merge the encoded columns back into the original data (aligned on the index)
data_clean = pd.concat([data_clean, binary_columns_all], axis=1)
# Report the result
print("Data processing finished; first rows:")
print(data_clean.head())
print("Shape:", data_clean.shape)
print("Column dtypes:")
print(data_clean.dtypes)
# Save the processed data to a new file (utf-8-sig adds a BOM so Excel reads the Chinese text correctly)
output_path = r'C:\Users\DELL\Desktop\中科院自动化所实习培训课程\数据分析实习项目\数据分析实习项目\GAD7_cleaned.csv'
data_clean.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"Data saved to: {output_path}")
# Finally, write the cache to disk in one go
with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    json.dump(ip_cache, f, ensure_ascii=False, indent=4)
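# A more defensive variant (a sketch; os.replace is atomic on the same filesystem,
# so a crash mid-write cannot leave a half-written cache):
#   tmp = CACHE_FILE + '.tmp'
#   with open(tmp, 'w', encoding='utf-8') as f:
#       json.dump(ip_cache, f, ensure_ascii=False, indent=4)
#   os.replace(tmp, CACHE_FILE)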