MSSQL Random Number Probability Test

Random probability test

Create a table to tally the results:
create table t_test(ip char(15))

--truncate table t_test;

declare @i int;
set @i=0;

--loop 1000 times
while @i<1000
begin
--2015-1-20
declare @randnum int;
set @randnum=cast(ceiling(rand() * 10) as int);
--ceiling(rand() * 10) yields an integer from 1 to 10; of those, 3, 6 and 9
--are divisible by 3, so the .47 IP is inserted with a probability of about 30%
if(@randnum%3=0)
insert into t_test values('192.168.1.47');
else
insert into t_test values('192.168.1.40');

set @i=@i+1;

end
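
The modulo test is only an indirect way to get the 30% split. Since rand() returns a uniform value in [0, 1), the same effect can be had by comparing it to the target probability directly; a minimal equivalent sketch, not from the original post:

--equivalent form: rand() is uniform on [0, 1),
--so this branch is taken with probability 0.3
if(rand() < 0.3)
insert into t_test values('192.168.1.47');
else
insert into t_test values('192.168.1.40');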

--count how many times each IP appears
select COUNT(1),ip from t_test group by ip
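
To check the tally against the intended 30% directly, the counts can be turned into percentages with a windowed total; a minimal sketch (the cnt and pct aliases are illustrative, and the windowed SUM needs SQL Server 2005 or later):

--share of each IP as a percentage of all inserted rows
select ip, COUNT(1) as cnt,
COUNT(1) * 100.0 / SUM(COUNT(1)) over() as pct
from t_test group by ip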

Reposted from: https://www.cnblogs.com/cyun/p/4236754.html

A reader comment from 07-08, "this is the code":

import re

import jieba
import numpy as np
import torch
from sqlalchemy import create_engine, text, Column, Integer, String
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.dialects.mssql import VARBINARY
from transformers import AutoModel, AutoTokenizer

# Configuration (unchanged)
MODEL_PATH = r"C:\Users\Lenovo\nepu_qa_project\local_models\bge-small-zh"
DB_CONN_STR = "mssql+pyodbc://sa:123456@LAPTOP-6KK5U1P0/nepu_qa_db?driver=ODBC+Driver+17+for+SQL+Server"

# Database setup (unchanged)
Base = declarative_base()

class NepuData(Base):
    __tablename__ = 'nepudata'
    id = Column(Integer, primary_key=True)
    url = Column(String)
    full_content = Column(String)
    scraped_at = Column(String)
    vector = Column(VARBINARY(8000))

engine = create_engine(DB_CONN_STR)
Session = sessionmaker(bind=engine)

# Load the embedding model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Improved preprocessing
def preprocess_text(raw):
    """Enhanced text preprocessing."""
    if not raw:
        return ""
    # 1. Strip HTML tags
    raw = re.sub(r'<[^>]+>', '', raw)
    # 2. Strip fixed page prefixes ("Title:", "- Northeast Petroleum
    #    University", "Current location: ...")
    raw = re.sub(r'标题\s*:\s*', '', raw)
    raw = re.sub(r'-\s*东北石油大学', '', raw)
    raw = re.sub(r'当前位置\s*:.*?-\s*', '', raw)
    # 3. Replace "system notice" pages with a sentinel string
    if "系统提示" in raw and "抱歉" in raw:
        return "系统提示内容已移除"
    # 4. Split into lines and de-duplicate
    unique_lines = []
    seen = set()
    for line in raw.split('\n'):
        clean_line = line.strip()
        if clean_line and len(clean_line) > 10 and clean_line not in seen:
            seen.add(clean_line)
            unique_lines.append(clean_line)
    # 5. Keep only the leading core content
    return " ".join(unique_lines[:15])

# Improved vectorization
def text_to_vector(content: str) -> bytes:
    """Weighted-pooling vectorization."""
    cleaned_text = preprocess_text(content)
    # Empty or removed content: return a zero vector (the dimension
    # must match the embedding size of the model in use)
    if not cleaned_text or "已移除" in cleaned_text:
        return np.zeros(768).astype(np.float32).tobytes()
    # Segment with jieba before tokenizing
    processed_text = " ".join(jieba.lcut(cleaned_text))
    inputs = tokenizer(
        processed_text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Attention-mask-weighted average pooling
    weights = inputs['attention_mask'].float().to(device)
    embeddings = (outputs.last_hidden_state * weights.unsqueeze(-1)).sum(dim=1) / weights.sum(dim=1, keepdim=True)
    return embeddings[0].cpu().numpy().astype(np.float32).tobytes()

def final_vector_test():
    """Final vector quality test."""
    session = Session()
    try:
        records = session.execute(text(
            "SELECT id, full_content, vector FROM nepudata"
        )).fetchall()
        print("\nFinal vector quality report:")
        print(f"{len(records)} records in total")

        # Bucket record ids by content type
        # (news / notice / announcement / other)
        content_types = {}
        for r in records:
            content = r[1] or ""
            if "新闻" in content:
                content_types.setdefault("新闻", []).append(r[0])
            elif "通知" in content:
                content_types.setdefault("通知", []).append(r[0])
            elif "公告" in content:
                content_types.setdefault("公告", []).append(r[0])
            else:
                content_types.setdefault("其他", []).append(r[0])
        print("\nContent type distribution:")
        for ctype, ids in content_types.items():
            print(f"  {ctype}: {len(ids)}")

        # Extract vectors keyed by record id, so rows without a
        # stored vector do not shift the indices
        vec_by_id = {
            r[0]: np.frombuffer(r[2], dtype=np.float32)
            for r in records if r[2]
        }
        all_ids = list(vec_by_id)

        def cosine(v1, v2):
            return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

        # Cosine similarity of random record pairs
        print("\nRandom sample similarities:")
        for _ in range(5):
            id1, id2 = np.random.choice(all_ids, 2, replace=False)
            print(f"  records {id1} and {id2}: {cosine(vec_by_id[id1], vec_by_id[id2]):.4f}")

        # Within-class vs. between-class average similarity
        print("\nWithin-class vs. between-class similarity:")
        for ctype1, ids1 in content_types.items():
            for ctype2, ids2 in content_types.items():
                vecs1 = [vec_by_id[i] for i in ids1 if i in vec_by_id]
                vecs2 = [vec_by_id[i] for i in ids2 if i in vec_by_id]
                if len(vecs1) > 1 and len(vecs2) > 1:
                    # average pairwise cosine similarity
                    sims = [cosine(v1, v2) for v1 in vecs1 for v2 in vecs2]
                    print(f"  {ctype1}({len(vecs1)}) vs {ctype2}({len(vecs2)}): {np.mean(sims):.4f}")
    finally:
        session.close()

if __name__ == "__main__":
    print("Running final vector quality test...")
    final_vector_test()
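
For reference, a minimal T-SQL sketch of the nepudata table this script reads, inferred from the SQLAlchemy model above; the exact column types are assumptions:

--table expected by the script (types inferred, not from the comment)
create table nepudata (
id int primary key,
url nvarchar(max),
full_content nvarchar(max),
scraped_at nvarchar(100),
vector varbinary(8000)
);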