import pandas as pd

def compareDF(df1,df2,ColumnName, diff=''):
    
    leftList=[]
    rightList=[]
    for i in list(df1):
        leftList.append(str(i)+'_l')
    for i in list(df2):
        rightList.append(str(i)+'_r')
    
    
    df1.columns=leftList
    df1=df1.rename(columns={str(ColumnName)+"_l": ColumnName})
    df2.columns=rightList
    df2=df2.rename(columns={str(ColumnName)+"_r": ColumnName})
    df = pd.merge(df1,df2, on=ColumnName, suffixes=('', '') ,how='outer')
    df=df.fillna(0)
    df.sort_values(ColumnName, inplace=True)
    dropList=[]
    for index, row in df.iterrows():
        total=0
        for i in list(df):
            if i==ColumnName:
                continue
            else:
                total=row[i]+total
        if total==0:
            dropList.append(index)
    df=df.drop(dropList)
    df=df.reset_index(drop=True)
    
    #if diff=='x':
        
    
    return df

def formatX(c):  # Optional
    c=str(c)
    c=c.replace(',', '')
    c=c.replace('$','')
    return float(str(c))

def cleanDF(df, columnName): 
    df=df.fillna(0)
    for i in list(df):
        try:
            if i != columnName:
                df[i] = df.apply(lambda x: formatX(x[i]),axis=1)
        except Exception as e:
            continue
    return df

def getDiff(df, columnName):
    
    leftList=[]
    rightList=[]
    
    for i in list(df):
        if str(i)[-2:]=='_l':
            leftList.append(str(i)[:-2])
        elif str(i)[-2:]=='_r':
            rightList.append(str(i)[:-2])
    
    for i in leftList:
        if i not in rightList:
            df[str(i+'_r')]=0
    
    for i in rightList:
        if i not in leftList:
            df[str(i+'_l')]=0

    for i in list(df):
        try:
            if i != columnName and str(i)[-2:]!='_r':
                if str(i)[:-2]+'_r' in list(df):
                    df[str(i)[:-2]]=df[str(i)[:-2]+'_r']-df[i]
                else:
                    print("not there",str(i)[:-2]+'_r', list(df))
            elif '_r' in str(i):
                pass
                #df=df.drop([i], axis=1)
        except Exception as e:
            #print("Error:", str(e))
            continue
    
    for i in list(df):
        if str(i)[-2:]=='_l' or str(i)[-2:]=='_r':
            df=df.drop([i], axis=1)
        
    return df

df1 = pd.DataFrame(columns=["id","A", "B"], data=[[1,5,3], [2,3,5]])
df2 = pd.DataFrame(columns=["id","A", "B"], data=[[1,2,3], [2,3,6]])

df1=cleanDF(df1, 'id')
df2=cleanDF(df2, 'id')
result=compareDF(df1,df2,'id')

result.head()

# Right - Left
result2=getDiff(result, 'id')

not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B']
not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B']

result2.head()

df1 = pd.DataFrame(columns=["id","A", "B"], data=[[1,5,3], [2,3,5]])
df2 = pd.DataFrame(columns=["id","A", "B", "C"], data=[[1,2,3,4], [2,3,6,2], [12,13,16,1]])

df1=cleanDF(df1, 'id')
df2=cleanDF(df2, 'id')
result=compareDF(df2,df1,'id')

result.head()

result2=getDiff(result, 'id')

result2.head()

	id	A_l	B_l	A_r	B_r
0	1	5.0	3.0	2.0	3.0
1	2	3.0	5.0	3.0	6.0

	id	A	B
0	1	-3.0	0.0
1	2	0.0	1.0

	id	A_l	B_l	C_l	A_r	B_r
0	1	2.0	3.0	4.0	5.0	3.0
1	2	3.0	6.0	2.0	3.0	5.0
2	12	13.0	16.0	1.0	0.0	0.0

	id	A	B	C
0	1	3.0	0.0	-4.0
1	2	0.0	-1.0	-2.0
2	12	-13.0	-16.0	-1.0

Things about ML, DL, Data Science, AI

Thursday, July 8, 2021

Comparing two dimensionally different dataframes based on one mutual column