Python,pandas遍历csv文件,删除中文字符，特殊字符，将中文符号转换为英文符号

最新推荐文章于 2025-10-16 08:27:33 发布

原创最新推荐文章于 2025-10-16 08:27:33 发布 · 1.9w 阅读

56 ·

CC 4.0 BY-SA版权

OPENCV积累专栏收录该内容

7 篇文章

订阅专栏

本文介绍了使用Pandas处理CSV文件时如何读取、删除中文字符和特殊符号。讨论了read_csv的参数，如skip_blank_lines和skiprows，以及处理DtypeWarning的方法。此外，还提到了to_csv的保存功能，iterrows和iteritems的使用，以及loc的字符索引。在数据清理方面，文章展示了去除UTF-8数据中的中文字符和转换符号的技巧，并探讨了利用多进程加速处理的可能性。

部署运行你感兴趣的模型镜像

Pandas：Python Data Analysis Library

为了解决数据分析任务创建

数据结构：
***Series:***一维数组：与Numpy中的一维array类似。二者与Python基本的数据结构List也很相近，其区别是：List中的元素可以是不同的数据类型，而Array和Series中则只允许存储相同的数据类型，这样可以更有效的使用内存，提高运算效率。
***Time-Series:***以时间为索引的Series
***DataFrame:***二维的表格型数据结构，可以理解为Series的容器
***Panel:***三维数组，可以理解为DataFrame的容器

Pandas-read_csv()

函数参数详解
skip_blank_lines : boolean, default True
如果为True，则跳过空行；否则记为NaN。

skiprows : list-like or integer, default None
需要忽略的行数（从文件开始处算起），或需要跳过的行号列表（从0开始）。

获取标签，索引

 # Load the CSV into a DataFrame, then inspect its row index and column labels
 csv_file = pandas.read_csv('.csv')
 print(csv_file.index)  # row index
 print(csv_file.columns)  # column labels

read_csv()遇到的问题

DtypeWarning: Columns (1,5,7,16,…) have mixed types. Specify dtype option on import or set low_memory=False.

两个解决方案：
参考：mixed types问题

# Option 1: pass dtype to read_csv to declare each column's type explicitly
pd.read_csv(sio, dtype={"user_id": int, "username": object})
# Option 2: pass low_memory=False so the whole file is read before types are inferred
pd.read_csv(sio, low_memory=False)

low_memory:解决办法的原因： low_memory=False 参数设置后，pandas会一次性读取csv中的所有数据，然后对字段的数据类型进行唯一的一次猜测。这样就不会导致同一字段的Mixed types问题了。
但是这种方式真的非常不好，一旦csv文件过大，就会内存溢出；推荐第一种解决方案

Pandas-to_csv()

保存

iterrows()/行，iteritems()/列（注意：iteritems() 已在 pandas 2.0 中移除，新版本请改用 items()）

返回一个generator对象，是可迭代对象

Pandas-loc

Pandas库中有iloc、loc以及ix可以用来索引数据（注意：ix自pandas 0.20起已弃用，并在1.0中移除，新代码请使用loc或iloc）。
**iloc：** 基于整数位置的索引
**loc：** 基于标签（行名/列名）的索引
**ix：** 标签与位置的混合索引（已移除）

中文字符，特殊字符的去除

读入的数据为UTF-8格式，剔除其中的中文字符，特殊字符

下面放入字符剔除及中英文符号转换的函数，遍历自己的数组调用即可

# Convert Chinese punctuation marks to their English (ASCII) counterparts
#https://blog.youkuaiyun.com/nanbei2463776506/article/details/82967140
def C_trans_to_E(string):
    """Return *string* with Chinese punctuation replaced by ASCII punctuation.

    Characters that are not in the mapping are left untouched.

    Args:
        string: Text to convert.

    Returns:
        A new string in which each Chinese punctuation mark listed in
        ``C_pun`` has been replaced by the character at the same position
        in ``E_pun``.
    """
    # The two strings are aligned position by position. Both the opening
    # and the closing curly quotes map to the same straight quote, so a
    # quoted phrase is converted on both sides.
    E_pun = u',.!?[]()<>"\'"\''
    C_pun = u'，。！？【】（）《》“‘”’'
    # str.translate expects a mapping of code point -> code point
    table = {ord(src): ord(dst) for src, dst in zip(C_pun, E_pun)}
    return string.translate(table)

# Remove Chinese characters and special symbols
def removeChnAndCharacter(str1):
    """Strip CJK text and symbol characters from *str1*, keeping the rest.

    Chinese punctuation listed in ``C_pun`` is converted to its English
    equivalent through ``C_trans_to_E`` rather than dropped. Any other
    character that falls inside one of the listed code-point ranges is
    removed; everything else is kept unchanged.

    Args:
        str1: Value to clean.

    Returns:
        The cleaned string, or ``''`` when *str1* is not a ``str``.
    """
    C_pun = u'，。！？【】（）《》“‘'

    # Inclusive code-point ranges that are removed.
    # https://blog.youkuaiyun.com/qq_22520587/article/details/62454354
    removed_ranges = (
        (u'\u4e00', u'\u9fa5'),  # CJK Unified Ideographs (basic range)
        (u'\u3300', u'\u33FF'),  # CJK Compatibility
        (u'\u3200', u'\u32FF'),  # Enclosed CJK Letters and Months
        (u'\u2700', u'\u27BF'),  # Dingbats
        (u'\u2600', u'\u26FF'),  # Miscellaneous Symbols
        (u'\uFE10', u'\uFE1F'),  # Vertical Forms
        (u'\u2E80', u'\u2EFF'),  # CJK Radicals Supplement
        (u'\u3000', u'\u303F'),  # CJK Symbols and Punctuation
        (u'\u31C0', u'\u31EF'),  # CJK Strokes
        (u'\u2FF0', u'\u2FFF'),  # Ideographic Description Characters
        (u'\u3100', u'\u312F'),  # Bopomofo
        # The lower bound was previously \u21A0, which swallowed every
        # character from U+21A0 up to U+31BF (math operators, box drawing,
        # ...) instead of just this block.
        (u'\u31A0', u'\u31BF'),  # Bopomofo Extended
    )

    if not isinstance(str1, str):
        return ''

    kept = []
    for ch in str1:
        # Punctuation is checked first: several marks in C_pun live in the
        # U+3000-U+303F range and would otherwise be dropped before they
        # could be converted.
        if ch in C_pun:
            kept.append(C_trans_to_E(ch))
        elif any(low <= ch <= high for low, high in removed_ranges):
            continue
        else:
            kept.append(ch)

    return ''.join(kept)

多进程提速

# Worker task submitted to the process pool: appends processed rows to savePth.
# NOTE(review): the row-processing body is elided in this listing (the "......."
# line below), so how per_row is derived from csv_file/start/stop/step cannot be
# confirmed from here - presumably it walks the rows in [start, stop) by step.
def correctObject(csv_file,start,stop,step,savePth):
	# Extract each row and append it, one at a time, to the newly created file
	.......
	# mode='a' appends to the existing file; header=None is presumably meant to
	# suppress writing a header row on every append - TODO confirm
	per_row.to_csv(savePth,mode='a',header= None)
	return
	
# Size of the worker pool; one row slice is submitted per worker below.
max_process = 10
if __name__ == '__main__':
    csv_file = pandas.read_csv('name.csv', encoding='utf-8', low_memory=False)

    print(type(csv_file))
    start = min(csv_file.index)
    stop = max(csv_file.index)
    step = csv_file.index[1] - csv_file.index[0]

    savePth = 'name_new.csv'

    # Create the output file from an empty slice (column labels only) so the
    # workers can append rows below the header.
    if not os.path.exists(savePth):
        per_row = csv_file[0:0]
        per_row.to_csv(savePth)

    p = multiprocessing.Pool(max_process)

    # Number of index positions handed to each task.
    difScope = 10000
    # NOTE(review): only start .. start + difScope * max_process is submitted;
    # rows beyond that bound are not processed. Workers also append to the
    # same file concurrently, so output row order is presumably not
    # guaranteed - confirm this is acceptable.
    pending = [
        p.apply_async(
            func=correctObject,
            args=(
                csv_file,
                start + difScope * k,
                start + difScope * (k + 1),
                step,
                savePth,
            ),
        )
        for k in range(max_process)
    ]

    p.close()
    p.join()

    # apply_async stores a worker's exception in its AsyncResult; calling
    # get() re-raises it here instead of letting the failure pass silently.
    for task in pending:
        task.get()

    print('Down')

删除drop

import pandas
csv_file = pandas.read_csv('data.csv', encoding='utf-8', low_memory=False)

# Column labels, and a boolean mask that is True where a label contains "Unnamed"
table = csv_file.columns
table_flag = table.str.contains('Unnamed')

# Drop every flagged column in a single call:
#   axis=1 operates on columns, inplace=True modifies csv_file itself
csv_file.drop(labels=table[table_flag], axis=1, inplace=True)

# index=False keeps the row index out of the written file
csv_file.to_csv('sqlResult_2239794.csv', index=False)