import pandas as pd
import numpy as np
# Two sample frames that are identical except for cells (0, 'B') and (3, 'C').
df1 = pd.DataFrame({
    'A': ['a', 'b', 'c', 'd'],
    'B': ['h', 'i', 'j', 'k'],
    'C': ['o', 'p', 'q', 'r'],
})
df2 = pd.DataFrame({
    'A': ['a', 'b', 'c', 'd'],
    'B': ['H', 'i', 'j', 'k'],
    'C': ['o', 'p', 'q', 'R'],
})
display(df1, df2)

# Element-wise inequality mask, shared by every step below.
mismatch = df1 != df2

# First, find the rows in which at least one entry differs.
ne = mismatch.any(axis=1)
ne

# Move the columns into the row index so each cell becomes one entry.
ne_stacked = mismatch.stack()
ne_stacked

# Keep only the True entries: these are the cells that changed.
changed = ne_stacked.loc[ne_stacked]
# Name the two index levels so the output is easier to read.
changed.index = changed.index.set_names(['id', 'col'])
changed

# Positional (row, column) coordinates of the differing cells; they come back
# in row-major order, the same order in which the stacked entries appear.
difference_locations = np.where(mismatch.values)
changed_from = df1.values[difference_locations]
changed_to = df2.values[difference_locations]
pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)
*注意:这里的前提是 df1 和 df2 具有完全相同的索引。如果两者的索引不一致,可以先用 df1.index.intersection(df2.index) 取出共有的标签,只比较这一部分数据;具体实现留给读者作为练习。
组装成函数
def compare_df(df1, df2):
    """Return a side-by-side table of the cells that differ between two frames.

    Args:
        df1: The "before" DataFrame.
        df2: The "after" DataFrame. It must carry the same row and column
            labels as ``df1``; pandas raises ``ValueError`` when asked to
            compare frames that are not identically labelled.

    Returns:
        A DataFrame with one row per differing cell, indexed by a two-level
        MultiIndex named ``('id', 'col')`` (row label, column label), with
        the old value in column ``'from'`` and the new value in ``'to'``.
        Cells that are missing (NaN/None) in both frames are treated as
        unchanged. The result is empty when nothing differs.
    """
    # NaN != NaN evaluates to True element-wise, so a plain inequality would
    # report every position that is missing in both frames as a change.
    both_missing = df1.isna() & df2.isna()
    cond = (df1 != df2) & ~both_missing

    # Stack the mask so each cell is addressed by a (row label, column label)
    # pair, then keep only the cells flagged as different.
    ne_stacked = cond.stack()
    changed = ne_stacked[ne_stacked]
    changed.index.names = ['id', 'col']

    # np.where yields positions in row-major order, which is the same order
    # in which the stacked mask lists the cells, so values and index line up.
    difference_locations = np.where(cond.values)
    changed_from = df1.values[difference_locations]
    changed_to = df2.values[difference_locations]
    return pd.DataFrame(
        {'from': changed_from, 'to': changed_to},
        index=changed.index,
    )
# Run the packaged helper on the two sample frames; as a bare expression its
# result is presumably meant to be shown as notebook cell output.
compare_df(df1,df2)
翻译自:https://stackoverflow.com/questions/17095101/compare-two-dataframes-and-output-their-differences-side-by-side