
# Python for AI and Data Science in Practice

![Python for AI and Data Science](https://www.python.org/static/community_logos/python-powered-h-140x182.png)

## Getting Started with Machine Learning

### Scikit-learn Basics

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load the dataset
iris = load_iris()
X, y = iris.data, iris.target

# Preprocess the data: split and standardize
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Evaluate the model
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))
```
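
The value `n_neighbors=3` above is hard-coded. As a hedged sketch (not the only reasonable choice), one way to pick it is a small cross-validated grid search, continuing from the variables defined above:

```python
from sklearn.model_selection import GridSearchCV

# Search a few candidate values of n_neighbors with 5-fold cross-validation
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
```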

![Machine learning workflow](https://scikit-learn.org/stable/_static/ml_map.png)

### Feature Engineering in Practice

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Create sample data
data = pd.DataFrame({
    'text': ['Python is awesome', 'Machine learning is fun', 'Deep learning is fascinating'],
    'label': [1, 0, 0]
})

# Extract text features with TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(data['text'])

# Feature selection
selector = SelectKBest(chi2, k=5)
X_new = selector.fit_transform(X, data['label'])

print(f"原始特征数: {X.shape[1]}")
print(f"选择后特征数: {X_new.shape[1]}")
```
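
To see which terms `SelectKBest` actually kept, you can map the selection mask back to the vectorizer's vocabulary. A minimal sketch, continuing from the objects above and assuming scikit-learn >= 1.0 (for `get_feature_names_out`):

```python
import numpy as np

# Map the boolean selection mask back to the TF-IDF vocabulary
feature_names = np.array(tfidf.get_feature_names_out())
print(feature_names[selector.get_support()])
```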

## Deep Learning Frameworks

### Getting Started with TensorFlow 2.0

```python
import tensorflow as tf
from tensorflow.keras import layers

# Load the data
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Build the model
model = tf.keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5)

# Evaluate the model
model.evaluate(x_test, y_test, verbose=2)
```

![Neural network structure](https://www.tensorflow.org/images/keras/overview_of_keras_and_tensorflow.png)

### PyTorch in Practice

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# Define the network architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = x.view(-1, 28*28)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Load the data
transform = transforms.Compose([transforms.ToTensor()])
train_set = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)

# Initialize the model, loss, and optimizer
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
for epoch in range(5):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
```
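
The training loop above never measures accuracy. Here is a minimal evaluation sketch on the MNIST test split, reusing the imports and model from the block above (CPU only, no GPU handling assumed):

```python
# Evaluate on the MNIST test split
test_set = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1000)

model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        pred = model(data).argmax(dim=1)
        correct += (pred == target).sum().item()
print(f"Test accuracy: {correct / len(test_set):.4f}")
```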

## Natural Language Processing (NLP)

### Text Preprocessing

```python
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

sample_text = "Python is an interpreted, high-level programming language."
print(preprocess_text(sample_text))
```

### Sentiment Analysis in Practice

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Sample data
texts = ["I love Python", "I hate Java", "Python is great", "Java is confusing"]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Build a pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

# Train the model
model.fit(texts, labels)

# Predict on new text
test_texts = ["Python is confusing", "Java is great"]
predictions = model.predict(test_texts)
print(predictions)  # e.g. [0 1]; with only four training samples the predictions are not reliable
```

![NLP processing pipeline](https://miro.medium.com/max/1400/1*4G__SV580CxFj78o9yUXuQ.png)

## Computer Vision

### OpenCV Basics

```python
import cv2
import matplotlib.pyplot as plt

# Read the image ('example.jpg' is a placeholder path)
image = cv2.imread('example.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Image processing: grayscale, blur, edge detection
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
edges = cv2.Canny(blurred, 50, 150)

# Display the results
plt.figure(figsize=(12, 4))
plt.subplot(131), plt.imshow(image), plt.title('Original')
plt.subplot(132), plt.imshow(blurred, cmap='gray'), plt.title('Blurred')
plt.subplot(133), plt.imshow(edges, cmap='gray'), plt.title('Edges')
plt.show()
```

### Image Classification in Practice

```python
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

# Data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True)

# Load the pretrained model
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the convolutional layers
for layer in base_model.layers:
    layer.trainable = False

# Add fully connected layers
model = Sequential([
    base_model,
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
train_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary')

model.fit(train_generator, epochs=10)
```
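
Once trained, the classifier can be applied to a single image. A hedged inference sketch, where the file path is a placeholder and the rescaling mirrors the training generator:

```python
import numpy as np
from tensorflow.keras.preprocessing import image

# Load and preprocess one image (path is a placeholder)
img = image.load_img('data/test/sample.jpg', target_size=(224, 224))
x = image.img_to_array(img) / 255.0   # same rescale=1./255 as in training
x = np.expand_dims(x, axis=0)

prob = float(model.predict(x)[0][0])  # sigmoid output for the positive class
print('positive' if prob > 0.5 else 'negative', prob)
```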

![Computer vision applications](https://opencv.org/wp-content/uploads/2021/01/OpenCV-logo.png)

## Big Data Processing

### PySpark Basics

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Python Spark Example") \
    .getOrCreate()

# Create a DataFrame
data = [("Alice", 34), ("Bob", 45), ("Charlie", 28)]
df = spark.createDataFrame(data, ["Name", "Age"])

# Data operations
df.show()
df.filter(col("Age") > 30).show()
df.select(avg("Age")).show()

# Stop the Spark session
spark.stop()
```

### Parallel Computing with Dask

```python
import dask.dataframe as dd
import dask.array as da

# Create a large chunked random array
x = da.random.random((10000, 10000), chunks=(1000, 1000))

# Define the parallel computation (lazy, nothing runs yet)
result = (x + x.T).mean(axis=0)

# Execute the computation
computed_result = result.compute()
print(computed_result[:5])
```
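
The `dask.dataframe` import above is not exercised by the array example, so here is a brief sketch of the DataFrame side. The glob path and the column names `category` and `value` are hypothetical:

```python
# Lazily read many CSV files as one logical DataFrame (path is hypothetical)
df = dd.read_csv('data/*.csv')

# Group and aggregate in parallel; nothing runs until .compute()
print(df.groupby('category')['value'].mean().compute())
```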

![Big data ecosystem](https://spark.apache.org/images/spark-logo-trademark.png)

## Automated Machine Learning

### AutoML in Practice

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from autosklearn.classification import AutoSklearnClassifier

# Load the data
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3, random_state=42)

# Create the AutoML classifier
automl = AutoSklearnClassifier(
    time_left_for_this_task=120,  # seconds
    per_run_time_limit=30,
    n_jobs=-1
)

# Train the model
automl.fit(X_train, y_train)

# Evaluate the model
print(f"Model score: {automl.score(X_test, y_test)}")
print(f"Models in the final ensemble: {automl.show_models()}")
```

## Model Deployment

### Flask API Deployment

```python
from flask import Flask, request, jsonify
import pickle
import pandas as pd

app = Flask(__name__)

# Load the trained model
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

@app.route('/predict', methods=['POST'])
def predict():
    # Get the input data
    data = request.get_json()
    df = pd.DataFrame(data, index=[0])
    
    # Predict
    prediction = model.predict(df)
    
    # Return the result
    return jsonify({
        'prediction': int(prediction[0]),
        'probability': float(model.predict_proba(df).max())
    })

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```
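
To exercise the endpoint, here is a small client sketch using `requests`. The feature names in the payload are hypothetical and must match whatever columns the pickled model actually expects:

```python
import requests

# Hypothetical feature names; replace with the model's real input columns
payload = {'feature_1': 5.1, 'feature_2': 3.5}

resp = requests.post('http://localhost:5000/predict', json=payload)
print(resp.status_code, resp.json())
```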

### Containerized Deployment with Docker

```dockerfile
FROM python:3.8-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]

EXPOSE 5000
```

![Model deployment pipeline](https://www.kubeflow.org/docs/images/pipelines-sdk.png)

## Advanced Data Visualization

### Advanced Matplotlib Techniques

```python
import matplotlib.pyplot as plt
import numpy as np

# Create the data
x = np.linspace(0, 10, 100)
y1 = np.sin(x)
y2 = np.cos(x)

# Create the figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# First subplot
ax1.plot(x, y1, label='sin(x)', color='blue', linestyle='--')
ax1.set_title('Sine Wave')
ax1.set_xlabel('X axis')
ax1.set_ylabel('Y axis')
ax1.legend()
ax1.grid(True)

# Second subplot
ax2.plot(x, y2, label='cos(x)', color='red', linestyle='-')
ax2.set_title('Cosine Wave')
ax2.set_xlabel('X axis')
ax2.set_ylabel('Y axis')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.savefig('advanced_plot.png')
plt.show()
```

### Interactive Visualization

```python
import plotly.express as px
import pandas as pd

# Create the data
df = pd.DataFrame({
    'Country': ['USA', 'China', 'Japan', 'Germany', 'UK'],
    'GDP': [21.43, 14.34, 5.08, 3.85, 2.83],
    'Population': [328, 1393, 126, 83, 67]
})

# Create an interactive chart
fig = px.scatter(df, x='Population', y='GDP', size='GDP', color='Country',
                 hover_name='Country', size_max=60,
                 title='GDP vs Population by Country')

fig.show()
```

![Interactive visualization](https://plotly.com/python/static/images/plotly-logo.png)

## Closing Thoughts and Learning Path

![AI learning path](https://www.python.org/static/community_logos/python-logo-master-v3-TM.png)

Through this seven-part tutorial series, you have covered the core applications of Python in artificial intelligence and data science. From here you can:

1. **Go deeper in a specialty**:
   - Natural language processing: advanced models such as Transformers and BERT
   - Computer vision: object detection and image segmentation
   - Reinforcement learning: OpenAI Gym, Stable Baselines

2. **Enter competitions**:
   - Kaggle data science competitions
   - Chinese-language competitions such as AI Challenger

3. **Build industrial-grade applications**:
   - Model serving and high-performance inference
   - Large-scale feature engineering
   - Online learning systems

4. **Pursue academic research**:
   - Read papers from top conferences (NeurIPS, ICML, CVPR, etc.)
   - Reproduce the code of classic papers
   - Contribute to open-source AI projects

Python's position in the AI field is hard to replace; keep learning and practicing, and you will grow into an industry expert!
