数据处理专题（十一）-CSDN博客

本文链接：https://blog.csdn.net/weixin_46281518/article/details/146269432

大数据处理

目标

学会处理大规模数据。

学习内容

Dask 基础

实践：使用 Dask 处理一个大型 CSV 文件

代码示例

1. 导入必要的库

import dask.dataframe as ddimport pandas as pdimport numpy as npimport osimport matplotlib.pyplot as plt

2. 创建示例大型 CSV 文件

为了演示方便，我们先创建一个较大的 CSV 文件。实际应用中，你可以直接使用现有的大型 CSV 文件。

def create_large_csv(file_path, num_rows=1000000):
    """Write a CSV file of synthetic rows to ``file_path``.

    The file gets four columns: ``id`` (0 .. num_rows - 1), ``value1`` and
    ``value2`` (values from ``np.random.rand``) and ``category`` (picked at
    random from 'A', 'B' and 'C'). The DataFrame index is not written.

    Args:
        file_path: Destination path of the CSV file.
        num_rows: Number of data rows to generate.
    """
    # Draw the random columns in the same order the columns are declared.
    ids = range(num_rows)
    first_values = np.random.rand(num_rows)
    second_values = np.random.rand(num_rows)
    categories = np.random.choice(['A', 'B', 'C'], size=num_rows)

    frame = pd.DataFrame(
        {
            'id': ids,
            'value1': first_values,
            'value2': second_values,
            'category': categories,
        }
    )
    frame.to_csv(file_path, index=False)


# Generate the example file used in the rest of the walkthrough.
file_path = 'large_dataset.csv'
create_large_csv(file_path)
print(f"创建的大型 CSV 文件: {file_path}")

3. 使用 Dask 读取大型 CSV 文件

# Open the large CSV file as a Dask DataFrame and preview its first rows.
dask_df = dd.read_csv(file_path)
first_rows = dask_df.head()
print(f"Dask DataFrame 的前几行: \n{first_rows}")

4. 数据探索

查看数据的基本信息

# Inspect basic information about the dataset.
# A Dask DataFrame's ``shape`` leaves the row count lazy, so printing it shows
# a deferred placeholder instead of a number. Count the rows explicitly with
# ``len`` (this triggers a computation) and pair it with the column count.
row_count = len(dask_df)
column_count = len(dask_df.columns)
print(f"数据的形状: {(row_count, column_count)}")
print(f"数据的列名: {dask_df.columns}")
# ``describe()`` is lazy as well; ``compute()`` produces the actual statistics.
print(f"数据的描述统计: \n{dask_df.describe().compute()}")

查看特定列的唯一值

# List the distinct values that occur in the category column.
category_column = dask_df['category']
unique_categories = category_column.unique().compute()
print(f"类别列的唯一值: {unique_categories}")

5. 数据清洗

处理缺失值

# Count the missing values in every column.
missing_values = dask_df.isnull().sum().compute()
print(f"每列的缺失值数量: \n{missing_values}")

# Fill missing values with the mean of each column.
# The mean is restricted to numeric columns because the frame also carries the
# string-valued ``category`` column, which cannot be averaged.
# NOTE(review): on pandas-2-based stacks, averaging a string column is expected
# to raise rather than be skipped — confirm against the installed versions.
column_means = dask_df.mean(numeric_only=True).compute()
dask_df = dask_df.fillna(column_means)
print(f"处理缺失值后的数据: \n{dask_df.head()}")

过滤数据

# Keep only the rows whose value1 is greater than 0.5.
above_half = dask_df['value1'] > 0.5
filtered_df = dask_df[above_half]
filtered_preview = filtered_df.head()
print(f"过滤后的数据: \n{filtered_preview}")

6. 数据聚合

按类别分组并计算平均值

# Group the filtered rows by category and average the remaining columns.
rows_by_category = filtered_df.groupby('category')
grouped_df = rows_by_category.mean().compute()
print(f"按类别分组并计算平均值: \n{grouped_df}")

7. 数据可视化

使用 Pandas 进行可视化

# Materialise the filtered Dask DataFrame as an in-memory pandas DataFrame.
pandas_df = filtered_df.compute()

# Draw a 30-bin histogram of value1, label the chart and display it.
value1_column = pandas_df['value1']
value1_column.hist(bins=30)
plt.title('Value1 的分布')
plt.ylabel('Frequency')
plt.xlabel('Value1')
plt.show()

8. 保存处理后的数据

保存为新的 CSV 文件

# Write the filtered rows to one new CSV file, leaving out the index.
output_file_path = 'processed_large_dataset.csv'
csv_options = {'single_file': True, 'index': False}
filtered_df.to_csv(output_file_path, **csv_options)
print(f"处理后的数据已保存到: {output_file_path}")

实践

# Import the required libraries.
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


def create_large_csv(file_path, num_rows=1000000):
    """Write a CSV file of synthetic rows to ``file_path``.

    The file gets four columns: ``id`` (0 .. num_rows - 1), ``value1`` and
    ``value2`` (values from ``np.random.rand``) and ``category`` (picked at
    random from 'A', 'B' and 'C'). The DataFrame index is not written.

    Args:
        file_path: Destination path of the CSV file.
        num_rows: Number of data rows to generate.
    """
    data = {
        'id': range(num_rows),
        'value1': np.random.rand(num_rows),
        'value2': np.random.rand(num_rows),
        'category': np.random.choice(['A', 'B', 'C'], size=num_rows),
    }
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)


# Create the example large CSV file.
file_path = 'large_dataset.csv'
create_large_csv(file_path)
print(f"创建的大型 CSV 文件: {file_path}")

# Read the large CSV file with Dask and preview the first rows.
dask_df = dd.read_csv(file_path)
print(f"Dask DataFrame 的前几行: \n{dask_df.head()}")

# Inspect basic information about the dataset.
# A Dask DataFrame's ``shape`` leaves the row count lazy, so printing it shows
# a deferred placeholder instead of a number. Count the rows explicitly with
# ``len`` (this triggers a computation) and pair it with the column count.
row_count = len(dask_df)
column_count = len(dask_df.columns)
print(f"数据的形状: {(row_count, column_count)}")
print(f"数据的列名: {dask_df.columns}")
print(f"数据的描述统计: \n{dask_df.describe().compute()}")

# List the distinct values of one column.
unique_categories = dask_df['category'].unique().compute()
print(f"类别列的唯一值: {unique_categories}")

# Count the missing values in every column.
missing_values = dask_df.isnull().sum().compute()
print(f"每列的缺失值数量: \n{missing_values}")

# Fill missing values with the mean of each column.
# The mean is restricted to numeric columns because ``category`` holds
# strings, which cannot be averaged.
# NOTE(review): on pandas-2-based stacks, averaging a string column is expected
# to raise rather than be skipped — confirm against the installed versions.
column_means = dask_df.mean(numeric_only=True).compute()
dask_df = dask_df.fillna(column_means)
print(f"处理缺失值后的数据: \n{dask_df.head()}")

# Keep only the rows whose value1 is greater than 0.5.
filtered_df = dask_df[dask_df['value1'] > 0.5]
print(f"过滤后的数据: \n{filtered_df.head()}")

# Group by category and compute the mean of the remaining columns.
grouped_df = filtered_df.groupby('category').mean().compute()
print(f"按类别分组并计算平均值: \n{grouped_df}")

# Convert the Dask DataFrame into an in-memory pandas DataFrame.
pandas_df = filtered_df.compute()

# Draw a histogram of value1.
pandas_df['value1'].hist(bins=30)
plt.xlabel('Value1')
plt.ylabel('Frequency')
plt.title('Value1 的分布')
plt.show()

# Save the processed data as a new, single CSV file without the index.
output_file_path = 'processed_large_dataset.csv'
filtered_df.to_csv(output_file_path, single_file=True, index=False)
print(f"处理后的数据已保存到: {output_file_path}")

小结

通过今天的练习，你应该已经学会了如何使用 Dask 处理大规模数据。Dask 是一个强大的工具，可以帮助你在内存有限的情况下处理大型数据集。

自然语言处理

目标

学会基本的自然语言处理技术。

学习内容

文本预处理（分词、去停用词）

TF-IDF

代码示例

1. 导入必要的库

import pandas as pdimport numpy as npimport reimport jiebafrom sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.model_selection import train_test_split

2. 创建示例文本数据集

# Build a small example text dataset: four sentences, each with a 0/1 label.
texts = [
    "这是一个示例句子，用于展示文本预处理。",
    "自然语言处理是一门非常有趣的学科。",
    "通过分词和去停用词，我们可以提取文本的重要特征。",
    "TF-IDF 是一种常用的文本特征提取方法。",
]
labels = [1, 0, 1, 0]
data = dict(text=texts, label=labels)

df = pd.DataFrame(data)
print(f"示例数据集: \n{df}")

3. 文本预处理

分词

def tokenize(text):
    """Segment ``text`` with jieba and return the tokens joined by spaces."""
    tokens = jieba.cut(text)
    return " ".join(tokens)


# Store the space-separated tokens of every sentence in a new column.
raw_sentences = df['text']
df['tokenized_text'] = raw_sentences.apply(tokenize)
print(f"分词后的数据集: \n{df}")

去停用词

# Stop words to strip from the tokenized text.
stopwords = {'的', '是', '一', '这', '用', '可以', '我们', '通过'}


def remove_stopwords(text):
    """Return ``text`` without stop words.

    ``text`` is split on whitespace, every token found in ``stopwords`` is
    dropped, and the remaining tokens are joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stopwords)
    return " ".join(kept_tokens)


# Store the stop-word-free text of every sentence in a new column.
tokenized_sentences = df['tokenized_text']
df['cleaned_text'] = tokenized_sentences.apply(remove_stopwords)
print(f"去除停用词后的数据集: \n{df}")

4. 计算 TF-IDF

初始化 TF-IDF 向量化器

# Initialise the TF-IDF vectorizer with its default settings.
# NOTE(review): scikit-learn documents the default token pattern as keeping
# only tokens of two or more word characters, so single-character words would
# be ignored here — confirm that is intended for Chinese text.
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF: fit the vectorizer on the cleaned text and transform it.
cleaned_corpus = df['cleaned_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_corpus)
print(f"TF-IDF 矩阵的形状: {tfidf_matrix.shape}")

# Fetch the feature names that label the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"特征名称: {feature_names}")

# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame.
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)
print(f"TF-IDF 矩阵: \n{tfidf_df}")

实践

对一个文本数据集进行预处理和 TF-IDF 计算。

# Import the required libraries.
import pandas as pd
import numpy as np
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a small example text dataset: four sentences, each with a 0/1 label.
texts = [
    "这是一个示例句子，用于展示文本预处理。",
    "自然语言处理是一门非常有趣的学科。",
    "通过分词和去停用词，我们可以提取文本的重要特征。",
    "TF-IDF 是一种常用的文本特征提取方法。",
]
labels = [1, 0, 1, 0]
data = dict(text=texts, label=labels)
df = pd.DataFrame(data)
print(f"示例数据集: \n{df}")


def tokenize(text):
    """Segment ``text`` with jieba and return the tokens joined by spaces."""
    tokens = jieba.cut(text)
    return " ".join(tokens)


# Store the space-separated tokens of every sentence in a new column.
df['tokenized_text'] = df['text'].apply(tokenize)
print(f"分词后的数据集: \n{df}")

# Stop words to strip from the tokenized text.
stopwords = {'的', '是', '一', '这', '用', '可以', '我们', '通过'}


def remove_stopwords(text):
    """Return ``text`` without stop words.

    ``text`` is split on whitespace, every token found in ``stopwords`` is
    dropped, and the remaining tokens are joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stopwords)
    return " ".join(kept_tokens)


# Store the stop-word-free text of every sentence in a new column.
df['cleaned_text'] = df['tokenized_text'].apply(remove_stopwords)
print(f"去除停用词后的数据集: \n{df}")

# Initialise the TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF: fit the vectorizer on the cleaned text and transform it.
cleaned_corpus = df['cleaned_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_corpus)
print(f"TF-IDF 矩阵的形状: {tfidf_matrix.shape}")

# Fetch the feature names that label the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"特征名称: {feature_names}")

# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame.
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)
print(f"TF-IDF 矩阵: \n{tfidf_df}")