# CSV file comparison
import pandas as pd
def read_lines_from_file(file_name):
    """Load a CSV file into a dict keyed by its first column.

    The file is read with ``pd.read_csv`` in chunks so the whole file is
    never parsed into a single DataFrame. For every row, the first column
    becomes the key and the second column becomes the value; when a key
    occurs more than once, the last occurrence wins.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A dict mapping first-column values to second-column values.
    """
    rows_per_chunk = 10 ** 6
    id_to_value = {}
    chunk_reader = pd.read_csv(file_name, chunksize=rows_per_chunk)
    for frame in chunk_reader:
        # Convert the chunk to plain Python lists, then keep only the
        # first two fields of each row.
        id_to_value.update(
            (row[0], row[1]) for row in frame.values.tolist()
        )
    # Report the file name immediately followed by the number of distinct keys.
    print(file_name+str(len(id_to_value)))
    return id_to_value
def _is_nan(value):
    """Return True when *value* is a float NaN (the only float unequal to itself)."""
    return isinstance(value, float) and value != value


def get_difference(record_map_small, record_map_bigger):
    """Find keys present in both maps whose values disagree.

    Keys that are missing from ``record_map_bigger`` (or mapped to ``None``
    there) are skipped. Each mismatch is printed and collected.

    Args:
        record_map_small: Dict whose keys drive the comparison.
        record_map_bigger: Dict looked up for each key of the small map.

    Returns:
        A list of ``(key, value_from_small, value_from_bigger)`` tuples, in
        the iteration order of ``record_map_small``.
    """
    res = []
    for item_id, value_from_small in record_map_small.items():
        value_from_bigger = record_map_bigger.get(item_id)
        if value_from_bigger is None:
            continue
        # NaN != NaN is always True, so two missing values (e.g. empty CSV
        # cells loaded through pandas) would otherwise be reported as a
        # difference even though both sides are equally empty.
        if _is_nan(value_from_small) and _is_nan(value_from_bigger):
            continue
        if value_from_small != value_from_bigger:
            print(item_id, value_from_small, value_from_bigger)
            res.append((item_id, value_from_small, value_from_bigger))
    return res
def export_list_to_file(res, filename):
    """Write comparison results to *filename*, one comma-separated line each.

    Args:
        res: Iterable of indexable items; the first three fields of each
            item are written as ``field0,field1,field2``.
        filename: Path of the output file; it is created or truncated.
    """
    with open(filename, 'w') as f:
        for item in res:
            # Convert every field with str(): values may be numbers rather
            # than strings, and concatenating those to "," raises TypeError.
            fields = (item[0], item[1], item[2])
            f.write(",".join(str(field) for field in fields) + '\n')
if __name__ == '__main__':
    # Load the two CSV files to compare, each as a key -> value dict.
    small_records = read_lines_from_file(file_name="/Users/.../test-app-deep.csv")
    bigger_records = read_lines_from_file(file_name="/Users/.../part-00000.csv")

    # Collect the keys whose values disagree and write them out.
    mismatches = get_difference(small_records, bigger_records)
    export_list_to_file(mismatches, "/Users/..../test-app-dp-3.csv")