2016/02/27 codes

This post walks through the collision-detection and pair-management internals of a JavaScript port of the Box2D physics engine: the core math helpers (absolute value, min/max/clamp, range checks), AABB validity testing, and how colliding pairs are managed through a hash table.

b2Math.b2Abs = function(a){return a > 0.0 ? a : -a ;};
b2Math.b2AbsV = function(a){var b = new b2Vec2(b2Math.b2Abs(a.x),b2Math.b2Abs(a.y));return b};
b2Math.b2AbsM=function(A){var B=new b2Mat22(0,b2Math.b2AbsV(A.col1),b2Math.b2AbsV(A.col2));return B;};
b2Math.b2Min = function(a,b){return a < b ? a : b;};
b2Math.b2MinV = function(a,b){var c = new b2Vec2(b2Math.b2Min(a.x, b.x),b2Math.b2Min(a.y, b.y));return c;};
b2Math.b2Max = function(a,b){return a > b ? a : b;};
b2Math.b2MaxV = function(a,b){var c = new b2Vec2(b2Math.b2Max(a.x, b.x),b2Math.b2Max(a.y, b.y));return c;};
b2Math.b2Clamp = function(a,low,high){return b2Math.b2Max(low,b2Math.b2Min(a,high));};
b2Math.b2ClampV = function(a,low,high){return b2Math.b2MaxV(low,b2Math.b2MinV(a,high));};
b2Math.b2Swap = function(a,b){var tmp = a[0];a[0] = b[0];b[0] = tmp;};
b2Math.b2Random = function(){return Math.random() * 2 - 1;};
b2Math.b2NextPowerOfTwo = function(x){x |= (x >> 1) & 0x7FFFFFFF;x |= (x >> 2) & 0x3FFFFFFF;x |= (x >> 4) & 0x0FFFFFFF;x |= (x >> 8) & 0x00FFFFFF;x |= (x >> 16) & 0x0000FFFF;return x + 1;};
b2Math.b2IsPowerOfTwo = function(x){var result = x > 0 && (x&(x - 1)) == 0;return result;};
b2Math.tempVec2 = new b2Vec2();
b2Math.tempVec3 = new b2Vec2();
b2Math.tempVec4 = new b2Vec2();
b2Math.tempVec5 = new b2Vec2();
b2Math.tempMat = new b2Mat22();
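A quick check of the helpers above (illustrative only; b2Vec2 comes from the rest of the port):
var v = b2Math.b2ClampV(new b2Vec2(-3, 7), new b2Vec2(0, 0), new b2Vec2(5, 5));
// v is (0, 5): each component is clamped independently
var n = b2Math.b2NextPowerOfTwo(20); // 32; note an exact power of two is bumped up (32 -> 64)
var p = b2Math.b2IsPowerOfTwo(n);    // true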
var b2AABB=Class.create();
b2AABB.prototype = {IsValid:function(){
var dX = this.maxVertex.x - this.minVertex.x;
var dY = this.maxVertex.y - this.minVertex.y;
var valid = dX > 0.0 && dY > 0.0;
valid = valid && this.minVertex.IsValid() && this.maxVertex.IsValid();
return valid;
},
minVertex: new b2Vec2(), maxVertex: new b2Vec2(),
initialize:function(){this.minVertex = new b2Vec2();this.maxVertex = new b2Vec2();}
};
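For example, IsValid accepts a box only when the max corner lies strictly above and to the right of the min corner (illustrative; Set(x, y) is assumed from the port's b2Vec2):
var box = new b2AABB();
box.minVertex.Set(0, 0);  // Set(x, y) assumed from the port's b2Vec2
box.maxVertex.Set(2, 3);
// box.IsValid() == true: dX = 2 > 0 and dY = 3 > 0
box.maxVertex.Set(-1, 3);
// box.IsValid() == false: dX = -1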
var b2Bound = Class.create();
b2Bound.prototype = {IsLower: function () {
return (this.value & 1) == 0; // lower bounds carry even values
},
IsUpper: function () {
return (this.value & 1) == 1; // upper bounds carry odd values
},
Swap: function (b) {
var tempValue = this.value;
var tempProxyId = this.proxyId;
var tempStabbingCount = this.stabbingCount;
this.value = b.value;
this.proxyId = b.proxyId;
this.stabbingCount = b.stabbingCount;
b.value = tempValue;
b.proxyId = tempProxyId;
b.stabbingCount = tempStabbingCount;
},
proxyId:0,value:0,stabbingCount:0,initialize:function(){}
}
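The parity convention matters in the sweep-and-prune broad phase: bound values are laid out so that lower edges are even and upper edges odd, which is exactly what IsLower/IsUpper test. A minimal illustration:
var lower = new b2Bound();
lower.value = 4;   // even value: lower edge of an interval
var upper = new b2Bound();
upper.value = 7;   // odd value: upper edge
// lower.IsLower() and upper.IsUpper() are both true
lower.Swap(upper); // exchanges value, proxyId and stabbingCount between the two bounds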
var b2BoundValues = Class.create();
b2BoundValues.prototype = {lowerValues:[0,0],upperValues:[0,0],
initialize:function(){this.lowerValues = [0,0];this.upperValues = [0,0];}}
var b2Pair =Class.create();
b2Pair.prototype = {
SetBuffered:function(){this.status|= b2Pair.e_pairBuffered;},
ClearBuffered:function(){this.status &=~b2Pair.e_pairBuffered;},
IsBuffered:function(){return (this.status & b2Pair.e_pairBuffered) == b2Pair.e_pairBuffered;},
SetRemoved:function(){this.status |= b2Pair.e_pairRemoved;},
ClearRemoved:function(){this.status &=~ b2Pair.e_pairRemoved;},
IsRemoved:function(){return (this.status & b2Pair.e_pairRemoved) == b2Pair.e_pairRemoved;},
SetFinal:function(){this.status |= b2Pair.e_pairFinal ;},
IsFinal:function(){return (this.status & b2Pair.e_pairFinal) == b2Pair.e_pairFinal;},
userData:0,proxyId1:0,proxyId2:0,next:0,status:0,
initialize:function(){}
};
b2Pair.b2_nullPair = b2Settings.USHRT_MAX;
b2Pair.b2_nullProxy = b2Settings.USHRT_MAX;
b2Pair.b2_tableCapacity = b2Settings.b2_maxPairs;
b2Pair.b2_tableMask = b2Pair.b2_tableCapacity - 1;
b2Pair.e_pairBuffered = 0x0001;
b2Pair.e_pairRemoved = 0x0002;
b2Pair.e_pairFinal = 0x0004;
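Since all three states live in one status bitfield, each Set/Clear/Is method touches only its own bit, so the flags combine freely:
var pair = new b2Pair();
pair.SetBuffered();   // status == 0x0001
pair.SetRemoved();    // status == 0x0003: both flags set at once
// pair.IsBuffered() == true, pair.IsRemoved() == true
pair.ClearBuffered(); // status == 0x0002: the removed bit is untouched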
var b2PairCallback = Class.create();
b2PairCallback.prototype = {
PairAdded:function(proxyUserData1,proxyUserData2){return null},
PairRemoved:function(pairUserData1,pairUserData2,pairUserData){},
initialize:function(){}
};
var b2BufferedPair = Class.create();
b2BufferedPair.prototype = {proxyId1:0,proxyId2:0,initialize:function(){}};

var b2PairManager = Class.create();
b2PairManager.prototype = {
initialize:function(){var i = 0;this.m_hashTable = new Array(b2Pair.b2_tableCapacity);
for( i = 0;i < b2Pair.b2_tableCapacity;++i){
this.m_hashTable[i] = b2Pair.b2_nullPair;
}
this.m_pairs = new Array(b2Settings.b2_maxPairs);
for( i = 0;i < b2Settings.b2_maxPairs;++i){this.m_pairs[i] = new b2Pair();}
this.m_pairBuffer = new Array(b2Settings.b2_maxPairs);
for( i = 0;i < b2Settings.b2_maxPairs;++i){this.m_pairBuffer[i] = new b2BufferedPair();}
for( i = 0;i < b2Settings.b2_maxPairs;++i){
this.m_pairs[i].proxyId1 = b2Pair.b2_nullProxy;
this.m_pairs[i].proxyId2 = b2Pair.b2_nullProxy;
this.m_pairs[i].userData = null;
this.m_pairs[i].status = 0;
this.m_pairs[i].next = (i + 1);
}
this.m_pairs[b2Settings.b2_maxPairs - 1].next = b2Pair.b2_nullPair;
this.m_pairCount = 0;
this.m_pairBufferCount = 0; // used by AddBufferedPair/RemoveBufferedPair below
},
// Class.create() runs initialize() as the constructor; Initialize() is the separate setup call
Initialize:function(broadPhase,callback){this.m_broadPhase = broadPhase;this.m_callback = callback;},
AddBufferedPair:function(proxyId1,proxyId2){
var pair = this.AddPair(proxyId1,proxyId2);
if(pair.IsBuffered() == false){
pair.SetBuffered();
this.m_pairBuffer[this.m_pairBufferCount].proxyId1 = pair.proxyId1;
this.m_pairBuffer[this.m_pairBufferCount].proxyId2 = pair.proxyId2;
++this.m_pairBufferCount;
}
pair.ClearRemoved();
if(b2BroadPhase.s_validate){
this.ValidateBuffer();
}
},
RemoveBufferedPair:function(proxyId1,proxyId2){
var pair = this.Find(proxyId1,proxyId2); // Find() is the hash-table lookup, defined elsewhere in b2PairManager
if(pair == null){return;}
if(pair.IsBuffered() == false){
pair.SetBuffered();
this.m_pairBuffer[this.m_pairBufferCount].proxyId1 = pair.proxyId1;
this.m_pairBuffer[this.m_pairBufferCount].proxyId2 = pair.proxyId2;
++this.m_pairBufferCount;
}
pair.SetRemoved();
if(b2BroadPhase.s_validate){
this.ValidateBuffer();
}
}
};
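The hash table works because b2_tableCapacity is a power of two, so masking with b2_tableMask is the same as taking the hash modulo the capacity, only cheaper. The port's actual hash and Find functions are not part of this excerpt; a sketch of the indexing idea with a hypothetical key function:
// pairKey is a hypothetical stand-in for the port's real pair-hash function
function pairKey(proxyId1, proxyId2){
    return ((proxyId2 << 16) | proxyId1) >>> 0; // pack both proxy ids into one 32-bit key
}
var bucket = pairKey(3, 8) & b2Pair.b2_tableMask; // equivalent to pairKey(3, 8) % b2Pair.b2_tableCapacity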

Reposted from: https://www.cnblogs.com/whatcanido/p/5223707.html
