from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
PCA computes the eigenvectors of the feature covariance matrix to find the directions of maximum variance in the data, then projects the data onto those directions.
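To make the mechanism concrete, here is a minimal from-scratch sketch using NumPy (the toy matrix X mirrors the example below; sklearn's PCA additionally handles component ordering and sign conventions internally):
import numpy as np

X = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]])
# Center the data, then compute the covariance matrix of the features
X_centered = X - X.mean(axis=0)
cov = np.cov(X_centered, rowvar=False)
# Eigenvectors of the covariance matrix are the principal directions
eigenvalues, eigenvectors = np.linalg.eigh(cov)
# Sort by descending variance and project onto the top direction
order = np.argsort(eigenvalues)[::-1]
X_projected = X_centered @ eigenvectors[:, order[0]]
print("Projected data:", X_projected)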
Example Code
from sklearn.decomposition import PCA
import numpy as np

# Sample data
X = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]])

# Reduce dimensionality with PCA
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X)
print("Original Data Shape:", X.shape)
print("Reduced Data Shape:", X_pca.shape)
21. Data Modeling: Linear Regression
Linear regression is one of the simplest supervised learning algorithms; it fits a linear relationship between the features and the target variable.
Syntax
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
Linear regression finds the best set of linear weights by minimizing the residual sum of squares.
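For full-rank \( X \), the minimizer has the closed form \( w = (X^T X)^{-1} X^T y \); a minimal NumPy sketch on a toy dataset (modeling the intercept as an extra all-ones column is an assumption of this illustration):
import numpy as np

# Toy data: an all-ones intercept column plus one feature
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([2.5, 3.5, 5.0, 6.0])
# Solve the least-squares problem min ||Xw - y||^2 directly
w, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print("Intercept and slope:", w)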
Example Code
from sklearn.linear_model import LinearRegression
import numpy as np

# Sample data
X = np.array([[1], [2], [3], [4]])
y = np.array([2.5, 3.5, 5.0, 6.0])

# Train the model
model = LinearRegression()
model.fit(X, y)

# Predict
X_test = np.array([[5]])
prediction = model.predict(X_test)
print("Predicted Value for X=5:", prediction)
22. Data Modeling: Decision Trees
A decision tree builds a tree structure by splitting on features, and is suitable for both classification and regression tasks.
Syntax
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)
A decision tree uses information gain or Gini impurity as its splitting criterion and recursively partitions the feature space.
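As an illustration of the splitting criterion, a short sketch of the Gini impurity \( G = 1 - \sum_i p_i^2 \) for a set of class labels (gini_impurity is a hypothetical helper written for this illustration, not part of sklearn):
import numpy as np

def gini_impurity(labels):
    # Class proportions, then G = 1 - sum of squared proportions
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

print(gini_impurity([0, 0, 1, 1]))  # 0.5: maximally mixed for two classes
print(gini_impurity([0, 0, 0, 0]))  # 0.0: a pure node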
Example Code
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Sample data
X = np.array([[1], [2], [3], [4]])
y = np.array([0, 0, 1, 1])

# Train the model
model = DecisionTreeClassifier(max_depth=2)
model.fit(X, y)

# Predict
prediction = model.predict([[2.5]])
print("Predicted Class for X=2.5:", prediction)
23. Model Evaluation: Cross-Validation
Cross-validation evaluates a model's generalization performance; k-fold cross-validation is the most common form.
Syntax
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
The data is split into k subsets; each subset serves once as the validation set while the remaining subsets form the training set, and the model's average performance across folds is reported.
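To see the k subsets explicitly, a minimal sketch using sklearn's KFold on a tiny index array (cross_val_score performs this splitting internally):
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(10).reshape(-1, 1)
kf = KFold(n_splits=5)
# Each fold serves exactly once as the validation set
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold}: train={train_idx.tolist()}, validation={val_idx.tolist()}")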
Example Code
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# Sample data
X = np.random.rand(100, 3)
y = np.random.randint(0, 2, 100)

# Train the model with cross-validation
model = LogisticRegression()
scores = cross_val_score(model, X, y, cv=5)
print("Cross-Validation Scores:", scores)
print("Mean Score:", scores.mean())
import torch
import torch.nn as nn
model = nn.Sequential(nn.Linear(input_size, output_size), nn.ReLU(), ...)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr)
PyTorch's modular design supports defining the entire pipeline of forward propagation, backpropagation, and parameter updates.
Example Code
import torch
import torch.nn as nn
import torch.optim as optim

# Sample data
X_train = torch.rand(100, 3)
y_train = torch.randint(0, 2, (100,))

# Define the model
model = nn.Sequential(nn.Linear(3, 16), nn.ReLU(), nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 1), nn.Sigmoid())

# Loss function and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_train).squeeze()
    loss = loss_fn(output, y_train.float())
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
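After training, the model can be evaluated without tracking gradients; a minimal sketch continuing the example above (thresholding the sigmoid output at 0.5 is an assumption of this binary setup):
# Evaluate on the training data without building a computation graph
model.eval()
with torch.no_grad():
    probs = model(X_train).squeeze()
    preds = (probs > 0.5).long()
    accuracy = (preds == y_train).float().mean()
print("Training Accuracy:", accuracy.item())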
33. Time Series Analysis: ARIMA Models
ARIMA is a classic time series modeling method suited to stationary data.
Syntax
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(data, order=(p, d, q))
results = model.fit()
ARIMA combines three components: autoregression (AR), differencing (I), and moving average (MA); tuning the (p, d, q) parameters captures the structure of the time series.
Example Code
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Sample time series data
data = pd.Series([100, 120, 130, 125, 150, 160])

# Build the ARIMA model
model = ARIMA(data, order=(1, 1, 1))
results = model.fit()

# Print the results
print(results.summary())
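The fitted results object can also produce out-of-sample forecasts; a short sketch continuing the example above:
# Forecast the next 3 steps from the fitted model
forecast = results.forecast(steps=3)
print("Forecast:", forecast)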
34. Natural Language Processing: Text Cleaning
Text cleaning is a fundamental NLP operation that includes removing punctuation, lowercasing, and removing stop words.
Syntax
import re
cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
Text cleaning standardizes the format of the input data and strips redundant information, reducing the complexity of model training.
Example Code
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Sample text
text = "Hello World! This is an example of text preprocessing in NLP."

# Cleaning steps
text = text.lower()                  # lowercase
text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
stop_words = set(stopwords.words('english'))
cleaned_text = ' '.join(word for word in text.split() if word not in stop_words)
print("Cleaned Text:")
print(cleaned_text)
35. Natural Language Processing: Bag-of-Words (BoW)
The bag-of-words model represents text as word-frequency vectors and suits basic text classification tasks.
Syntax
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
By counting how often each word occurs, text is represented as a sparse matrix; word order is ignored.
Example Code
from sklearn.feature_extraction.text import CountVectorizer

# Sample text
texts = ["I love Python programming.", "Python is great for data science!"]
# Bag-of-words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
print("Feature Names:", vectorizer.get_feature_names_out())
print("Word Frequency Matrix:")
print(X.toarray())
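One design consequence worth noting: once fitted, the vectorizer's vocabulary is fixed, so transforming new text silently drops unseen words. A short sketch continuing the example above:
# Words outside the fitted vocabulary (here "java") map to zero counts
X_new = vectorizer.transform(["I love Java programming."])
print(X_new.toarray())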
36. Natural Language Processing: TF-IDF
TF-IDF (term frequency-inverse document frequency) is a weighting scheme that measures how distinctively a word contributes to a document.
Syntax
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
TF-IDF multiplies a term's frequency within a document (TF) by the inverse of its frequency across documents (IDF), down-weighting words that appear in many documents.
Example Code
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text
texts = ["I love Python programming.", "Python is great for data science!"]
# Compute TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
print("Feature Names:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(X.toarray())
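For intuition, a simplified textbook computation of TF-IDF (this sketch uses the plain \( \log(N/\mathrm{df}) \) form; sklearn's TfidfVectorizer applies a smoothed IDF and L2 normalization, so its numbers will differ):
import numpy as np

docs = [["i", "love", "python"], ["python", "is", "great"]]
N = len(docs)
# Document frequency: how many documents contain each word
vocab = sorted({w for d in docs for w in d})
df = {w: sum(w in d for d in docs) for w in vocab}
# TF-IDF per document: term frequency times log(N / df)
for d in docs:
    print({w: round((d.count(w) / len(d)) * np.log(N / df[w]), 3) for w in d})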
37. Text Classification: Naive Bayes
Naive Bayes is a probability-based classification algorithm that assumes the features are mutually independent.
Syntax
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
According to Bayes' theorem, the predicted class satisfies \( P(y|X) \propto P(X|y)P(y) \).
Example Code
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Sample text
texts = ["I love Python programming.", "Python is great for data science!", "I hate bugs."]
labels = [1, 1, 0]

# Bag-of-words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# Train naive Bayes
model = MultinomialNB()
model.fit(X, labels)

# Test
test_texts = ["Python is fun!", "I hate errors."]
X_test = vectorizer.transform(test_texts)
predictions = model.predict(X_test)
print("Predictions:", predictions)
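Since naive Bayes is probabilistic, the posterior \( P(y|X) \) behind each prediction is also available via predict_proba, continuing the example above:
# Posterior probability of each class for the test texts
probabilities = model.predict_proba(X_test)
print("Class Probabilities:", probabilities)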
38. Deep Learning: LSTM (Long Short-Term Memory Networks)
LSTM is a type of recurrent neural network (RNN) designed to capture long-range dependencies in sequences, making it well suited to time series and text data.
Syntax
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
model = Sequential([LSTM(units, input_shape=(timesteps, features)), Dense(output_units, activation)])
An LSTM regulates information flow through its forget, input, and output gates, addressing the vanishing and exploding gradient problems of vanilla RNNs.
Example Code
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Sample time series data: 100 samples, 10 time steps, 1 feature per step
X_train = np.random.rand(100, 10, 1)
y_train = np.random.randint(0, 2, 100)

# Build the LSTM model
model = Sequential([
    LSTM(32, input_shape=(10, 1), activation='tanh'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=8)
import optuna

def objective(trial):
    ...

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
Through iterative optimization, Optuna updates its probability model over candidate parameters based on the results of previous trials, gradually homing in on the optimal combination.
Example Code
import optuna
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Data
data = load_iris()
X, y = data.data, data.target

# Define the objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 200)
    max_depth = trial.suggest_int("max_depth", 1, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
    )
    return cross_val_score(clf, X, y, cv=3).mean()

# Create and optimize the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
49. Natural Language Processing: Sentiment Analysis
Sentiment analysis uses a classification model to determine the emotional polarity of text; it is widely applied in public opinion monitoring and user feedback analysis.
Syntax
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
classifier("I love programming!")
A pretrained language model (such as BERT or DistilBERT) is fine-tuned to map text representations to sentiment classes.
Example Code
from transformers import pipeline

# Load a pretrained sentiment analysis pipeline
classifier = pipeline("sentiment-analysis")
# Sample text
text = "I am very happy with this product!"
# Sentiment classification
result = classifier(text)
print("Sentiment Analysis Result:", result)
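For reproducibility, the checkpoint can also be pinned explicitly rather than relying on the pipeline's default (distilbert-base-uncased-finetuned-sst-2-english is, to my knowledge, the usual default for this task):
from transformers import pipeline

# Pin a specific checkpoint instead of relying on the default
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("I am very happy with this product!"))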