CSV generator
from collections import namedtuple

def csv_reader(fd, delimiter='\t'):
    def gen():
        for i in fd:
            yield i.rstrip('\n').split(delimiter)
    return gen()
def read_tsv(input_file, quotechar=None):
    with open(input_file, 'r', encoding='utf8') as f:
        reader = csv_reader(f)
        headers = next(reader)
        Example = namedtuple('Example', headers)  # build a record type from the header row
        examples = []
        for line in reader:
            example = Example(*line)
            examples.append(example)
        return examples
print(read_tsv("49"))
Input:
num type
16111 definition
20381 digit-money
Output:
[Example(num='16111', type='definition'), Example(num='20381', type='digit-money')]
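Note that read_tsv accepts a quotechar argument but never uses it, so a quoted field containing a tab would be split incorrectly. If quoting matters, the standard library's csv module already does lazy, quote-aware parsing; a minimal alternative sketch (read_tsv_std is my own name, not from the code above):

import csv
from collections import namedtuple

def read_tsv_std(input_file, quotechar='"'):
    # csv.reader is itself a lazy iterator over parsed rows
    with open(input_file, 'r', encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quotechar=quotechar)
        headers = next(reader)
        Example = namedtuple('Example', headers)
        return [Example(*row) for row in reader]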
Reading a 5 GB file
1. Reading the 5 GB CSV itself works, but it eats 11 GB of memory, which still leaves no room for any data-analysis operations.
If you only need to read it:
import pandas as pd
from tqdm import tqdm

TRAIN_PATH = '../input/train.csv'

# Pin each column to the smallest suitable dtype to optimize memory usage
traintypes = {'fare_amount': 'float32',
              'pickup_datetime': 'str',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'
              }

cols = list(traintypes.keys())
chunksize = 10_000_000  # 10 million rows per chunk; try 5 million if memory is still tight

# %%time  (Jupyter cell magic; put it at the top of a notebook cell to time the read)
df_list = []
for df_chunk in tqdm(pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)):
    df_list.append(df_chunk)

train_df = pd.concat(df_list)
del df_list
train_df.info()
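Even with the optimized dtypes, the pd.concat at the end materializes every chunk again, so peak memory barely improves. When only a summary is needed, folding each chunk into running totals keeps memory roughly flat; a minimal sketch reusing TRAIN_PATH, cols, traintypes and chunksize from above (the mean fare is just an illustrative target):

import pandas as pd

total_rows = 0
fare_sum = 0.0

# Accumulate statistics chunk by chunk; each chunk is then garbage-collected
for df_chunk in pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize):
    total_rows += len(df_chunk)
    fare_sum += float(df_chunk['fare_amount'].sum())

print('rows:', total_rows, 'mean fare_amount:', fare_sum / total_rows)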
2. dask
It reportedly handles data in batches, and even apply works; to be tried.
-------------------------------------- Update: tried it.
It has one fairly fatal drawback: there is no way to reset the index.
But I need to reset the index, so for now this attempt fails; perhaps it can be done without resetting.
The four operations below currently finish in seconds, with train_df holding 5,000,000+ rows (dask is lazy, so these calls only build a task graph; the real work happens at compute() time).
(This just illustrates the operations, which look very much like pandas DataFrame calls; the snippet as written will not run.)
import dask.dataframe as dd

train_df = dd.read_csv('../input/train.csv')

# drop unneeded columns
train_df = train_df.drop(['key', 'pickup_longitude', 'pickup_latitude',
                          'dropoff_longitude', 'dropoff_latitude'], axis=1)

# row-wise apply (func is a placeholder the snippet leaves undefined)
train_df['price'] = train_df.apply(lambda x: func(x.distance, x.fare_amount), axis=1)

# cast a column's dtype
train_df['week'] = train_df['week'].astype(int)

# filter rows
train_df = train_df[train_df['price'] > 0]
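On the reset_index point: as far as I can tell, dask's reset_index restarts at 0 within each partition instead of producing one continuous 0..n-1 index, which is likely why it looks unusable here. A hedged workaround sketch, assuming the filtered frame has become small enough to fit in memory: materialize to pandas first, then reset there.

# compute() turns the lazy dask graph into a concrete pandas DataFrame;
# after that, pandas' ordinary reset_index gives one continuous index.
result = train_df.compute()
result = result.reset_index(drop=True)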