When processing a large file, loading it into memory all at once can cause an out-of-memory error. pandas offers an iterator-based way to read a file, letting us manually set the number of rows to read per batch (chunkSize).
import pandas as pd
import chardet
# Detect the file encoding from the first line
with open(r'data.txt', 'rb') as f:
    encode = chardet.detect(f.readline())['encoding']
print(encode)
# If the detected encoding is ascii, it is recommended to fall back to utf-8
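# A minimal sketch of that fallback (assumption: treating ascii or an
# undetected encoding as utf-8 is acceptable for this file)
if encode is None or encode.lower() == 'ascii':
    encode = 'utf-8'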
reader = pd.read_csv(r'data.txt', iterator=True, encoding=encode, sep='\t')
# type(reader) is <class 'pandas.io.parsers.TextFileReader'>
#reader = pd.read_csv(files_name,
#                     engine='python',
#                     sep=separation,
#                     encoding='utf-8',
#                     iterator=True,
#                     error_bad_lines=False
#                     )
# (error_bad_lines was removed in pandas 2.0; use on_bad_lines='skip' instead)
# Number of rows to read per batch
chunkSize = 10000
chunks = []
while True:
    try:
        chunk = reader.get_chunk(chunkSize)
        # type(chunk) is <class 'pandas.core.frame.DataFrame'>
        # Note: pay attention to the file's column names (header handling)
        # do something
        chunks.append(chunk)
    except StopIteration:
        print("Iteration is stopped.")
        break
df = pd.concat(chunks, ignore_index=True)
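If only an aggregate of the data is needed, each chunk can be reduced as it is read instead of appending everything to a list, which keeps memory usage bounded. A minimal sketch, assuming data.txt is tab-separated, utf-8 encoded, and contains a numeric column named value (a hypothetical column name):

import pandas as pd

reader = pd.read_csv(r'data.txt', iterator=True, encoding='utf-8', sep='\t')
total = 0.0
rows = 0
while True:
    try:
        chunk = reader.get_chunk(10000)
        # Reduce each chunk immediately instead of storing it
        total += chunk['value'].sum()
        rows += len(chunk)
    except StopIteration:
        break
print(total / rows if rows else float('nan'))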
Alternatively:
reader = pd.read_csv(filename, dtype=str, chunksize=10000,
                     header=None, low_memory=False, sep=',')
for chunk in reader:
    # do something
    df = chunk
    print(df.shape[0])
print(type(reader))
# <class 'pandas.io.parsers.TextFileReader'>
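If the per-chunk result needs to be persisted rather than accumulated, each processed chunk can be appended to an output CSV so the full result never has to fit in memory. A minimal sketch, assuming hypothetical file names input.csv / output.csv and a placeholder process() step:

import pandas as pd

def process(chunk):
    # Placeholder transformation: keep rows whose first column is non-empty
    return chunk[chunk[0].notna()]

first = True
for chunk in pd.read_csv('input.csv', dtype=str, chunksize=10000,
                         header=None, sep=','):
    out = process(chunk)
    # Write the header only for the first chunk, then append
    out.to_csv('output.csv', mode='w' if first else 'a',
               header=first, index=False)
    first = False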
Or:
Manually compute batch boundaries over an existing DataFrame:
# requires `import swifter`; swifter parallelizes apply over each batch
chunk_size = 10000
for i in range(0, df.shape[0], chunk_size):
    _df = df.iloc[i: i + chunk_size].copy()
    # col1 / col1_new are placeholder column names
    _df[col1_new] = _df[col1].swifter.apply(lambda pkg: fun(pkg))
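The same slicing idea also works with plain pandas apply if swifter is not installed; the per-batch results can be collected and concatenated at the end. A minimal sketch with a hypothetical column col1 and row-wise function fun:

import pandas as pd

def fun(value):
    # Hypothetical per-value transformation
    return str(value).upper()

df = pd.DataFrame({'col1': ['a', 'b', 'c', 'd', 'e']})
chunk_size = 2
parts = []
for i in range(0, df.shape[0], chunk_size):
    _df = df.iloc[i: i + chunk_size].copy()
    _df['col1_new'] = _df['col1'].apply(fun)
    parts.append(_df)
result = pd.concat(parts, ignore_index=True)
print(result)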