This Python notebook compares two database tables and reports the differences between them.
The outputs (listed after the print statement in the final cell) include CSVs, text files, and a summary image.
# --- Comparison configuration ---
# Friendly labels for the two data sources; used in printed messages,
# output folder names and file names.
source_data_name1 = 'cloud'
source_data_name2 = 'prem'
# When True, null-like strings ('NaT', 'None', 'nan', 'NULL', '') are
# normalised to '0' before comparing, so differing null representations
# are not reported as discrepancies.
replace_null_like_values_with_0 = True
# When True, values are lower-cased, trimmed and stripped of hyphens and
# apostrophes before comparing.
# NOTE(review): the name is misspelled ("normlaise") but is referenced
# later in the notebook, so it is left unchanged here.
normlaise_case_trim_remove_special_chars = True
# When True, the columns listed in columns_to_delete (later cell) are
# dropped from both sources before comparison.
remove_named_columns = False
import pandas as pd
import numpy as np
import collections
import re
import pyperclip
import itertools
import os
from win32api import MessageBox
from datetime import datetime
import os
import shutil
import sqlite3
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
%autosave 0
Autosave disabled
# Build a dated output folder tree, replacing any previous output for today.
str_current_datetime = str(datetime.now().strftime("%Y-%m-%d"))
output_folder = "Outputs/" + source_data_name1 + " vs " + source_data_name2 + " (" + str_current_datetime + ")"
# Remove output from an earlier run today; a missing folder is fine.
# FIX: catch only FileNotFoundError instead of the original bare except,
# which would also hide e.g. permission errors.
try:
    shutil.rmtree(output_folder)
except FileNotFoundError:
    pass
os.makedirs(output_folder)
# Sub-folder for the column-name comparison text files.
output_folder_column_names = output_folder + "/Column name comparison"
os.makedirs(output_folder_column_names)
# Sub-folder for the lists of unique identifiers missing from each source.
output_folder_UI = output_folder + "/Unique identifiers missing in each"
os.makedirs(output_folder_UI)
# Load the first source table from the SQLite input database; everything
# is cast to str so later comparisons are purely textual.
conn = sqlite3.connect('Inputs/database.db')
sql1 = "SELECT * FROM trans_c"
df1 = pd.read_sql_query(sql1, conn).astype(str)
df1_records_with_dups = len(df1)
print(f"{source_data_name1} data has {df1_records_with_dups} records and {df1.shape[1]} columns")
cloud data has 200 records and 7 columns
# Load the second source table, then release the database connection.
sql2 = "SELECT * FROM trans_p"
df2 = pd.read_sql_query(sql2, conn)
df2 = df2.astype(str)
df2_records_with_dups = df2.shape[0]
print(source_data_name2 + " data has " + str(df2_records_with_dups) + " records and " + str(df2.shape[1]) + " columns")
# BUG FIX: the original said `conn.close` (no parentheses), which only
# references the bound method and leaves the connection open.
conn.close()
<function Connection.close>
# Write the raw (pre-filtering) row-count comparison to a text file.
row_count_message = "The row count differs by " + str(abs(df1_records_with_dups - df2_records_with_dups))
row_count = output_folder + "/Row count " + source_data_name1 + " vs " + source_data_name2 + ".txt"
with open(row_count, "w") as f:
    f.write(row_count_message + " (before any processing/filtering)")
    f.write("\n" + source_data_name1 + " has " + str(df1_records_with_dups) + " rows\n")
    # BUG FIX: original wrote "row" (singular) for the second source.
    f.write(source_data_name2 + " has " + str(df2_records_with_dups) + " rows")
print(row_count_message)
The row count differs by 1
# Optionally exclude named columns from both sources before comparing.
if remove_named_columns:
    columns_to_delete = ['Broker']
    # Index.intersection keeps only the names actually present, so a
    # missing column does not raise.
    df1 = df1.drop(columns=df1.columns.intersection(columns_to_delete))
    df2 = df2.drop(columns=df2.columns.intersection(columns_to_delete))
# Compare the column names of the two sources.
df1_columns = set(df1.columns)
df2_columns = set(df2.columns)
df1_columns_dif = df1_columns.difference(df2_columns)
df2_columns_dif = df2_columns.difference(df1_columns)
# Only columns present in BOTH sources can be value-compared later.
# (Computed unconditionally: the original only defined this on one branch,
# and later cells depend on it.)
column_names_in_both = df2_columns.intersection(df1_columns)
# BUG FIX: the original tested `len(df1_columns) + len(df2_columns) == 0`
# (the total column counts), so it claimed "same column names" even when
# the sources differed. Test the symmetric differences instead.
if df1_columns_dif or df2_columns_dif:
    print("These columns names appear in " + source_data_name1 + " only...")
    print(df1_columns_dif)
    print("\nThese columns names appear in " + source_data_name2 + " only...")
    print(df2_columns_dif)
else:
    print("The data sets have the same column names.")
print("\nThese columns names appear in both (and will be compared)...")
print(column_names_in_both)
The data sets have the same column names.
These columns names appear in both (and will be compared)...
{'IsRenewal', 'PolicyId', 'TransactionDate', 'Amount', 'TransactionId', 'Broker', 'Currency'}
def _write_name_list(path_prefix, names):
    """Write one name per line to `<path_prefix>.txt`.

    Appends ' (empty)' to the file name when there are no names, so an
    empty result is visible from the folder listing alone.
    """
    suffix = ' (empty)' if len(names) == 0 else ''
    with open(path_prefix + suffix + '.txt', 'w') as f:
        for name in names:
            f.write("%s\n" % name)

# One file per category (the original repeated this logic three times):
# names unique to each source, then the shared names.
_write_name_list(output_folder_column_names + "/Columns in df1 only", df1_columns_dif)
_write_name_list(output_folder_column_names + "/Columns in df2 only", df2_columns_dif)
_write_name_list(output_folder_column_names + "/Columns in " + source_data_name1 + " AND " + source_data_name2, column_names_in_both)
# Drop exact-duplicate rows from the first source and report the effect.
df1 = df1.drop_duplicates()
df1_records_no_dups = len(df1)
d1_dups_removed = str(df1_records_with_dups - df1_records_no_dups)
print(f"Dataframe from {source_data_name1} now has {df1_records_no_dups} records ({d1_dups_removed} were removed) and {df1.shape[1]} columns")
Dataframe from cloud now has 200 records (0 were removed) and 7 columns
# Drop exact-duplicate rows from the second source and report the effect.
df2 = df2.drop_duplicates()
df2_records_no_dups = len(df2)
d2_dups_removed = str(df2_records_with_dups - df2_records_no_dups)
print(f"Dataframe from {source_data_name2} now has {df2_records_no_dups} records ({d2_dups_removed} were removed) and {df2.shape[1]} columns")
Dataframe from prem now has 201 records (0 were removed) and 7 columns
# Strip trailing ".0" artefacts left by numeric columns cast to str, so
# e.g. "47259.0" and "47259" compare as equal.
# FIX: use raw strings for the regex patterns; '\.' inside a plain string
# is an invalid escape sequence (DeprecationWarning, future SyntaxError).
df1 = df1.replace(r'\.0+$', '', regex=True)
df2 = df2.replace(r'\.0+$', '', regex=True)
if replace_null_like_values_with_0:
    # Normalise every null-like representation to the single token '0'.
    df1 = df1.replace(['NaT', 'None', '0', 'nan', 'NULL', ''], '0')
    df2 = df2.replace(['NaT', 'None', '0', 'nan', 'NULL', ''], '0')
if normlaise_case_trim_remove_special_chars:
    # Lower-case, trim, and remove hyphens/apostrophes so cosmetic
    # differences are not reported as discrepancies.
    df1 = df1.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
    df1 = df1.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
    df1 = df1.replace(re.compile(r"\-|'"), '', regex=True)
    df2 = df2.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
    df2 = df2.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
    df2 = df2.replace(re.compile(r"\-|'"), '', regex=True)
# Identify near-unique columns in df1 — candidates for a unique identifier.
determine_unique = []
for col in df1.columns:
    total = len(df1[col])
    num_unique = len(df1[col].unique())
    # Guard against an empty frame (the original divided unconditionally).
    pct_unique = (num_unique / total) * 100 if total else 0.0
    # FIX: keep only columns that are at least 90% unique by comparing the
    # number directly; the original appended every column and then grepped
    # the formatted strings with r'100\.|9[0-9]\.', which was fragile.
    if pct_unique >= 90:
        determine_unique.append(f"{pct_unique:.1f}% of values in '{col}' are unique ({num_unique} unique values out of {total})")
print("These columns have at least 90% unique values...")
determine_unique.sort()
determine_unique
These columns have at least 90% unique values...
["100.0% of values in 'Amount' are unique (200 unique values out of 200)", "100.0% of values in 'TransactionId' are unique (200 unique values out of 200)"]
# Pull the column names back out of the summary strings, then preview the
# candidate identifier columns in df1.
cleaned_list = list(itertools.chain.from_iterable(
    re.findall(r"in '([^']+)", entry) for entry in determine_unique))
df1_examine = df1[cleaned_list]
df1_examine.head(3)
| Amount | TransactionId | |
|---|---|---|
| 0 | 47259 | 8028676 |
| 1 | 48114 | 1488580 |
| 2 | 19018 | 3095155 |
# Identify near-unique columns in df2 (same analysis as for df1 above).
determine_unique = []
for col in df2.columns:
    total = len(df2[col])
    num_unique = len(df2[col].unique())
    # Guard against an empty frame (the original divided unconditionally).
    pct_unique = (num_unique / total) * 100 if total else 0.0
    # FIX: numeric >= 90 filter instead of the original regex over the
    # formatted string (r'100\.|9[0-9]\.'), which was fragile.
    if pct_unique >= 90:
        determine_unique.append(f"{pct_unique:.1f}% of values in '{col}' are unique ({num_unique} unique values out of {total})")
print("These columns have at least 90% unique values...")
determine_unique.sort()
determine_unique
These columns have at least 90% unique values...
["100.0% of values in 'Amount' are unique (201 unique values out of 201)", "100.0% of values in 'TransactionId' are unique (201 unique values out of 201)"]
# Pull the column names back out of the summary strings, then preview the
# candidate identifier columns in df2.
cleaned_list = list(itertools.chain.from_iterable(
    re.findall(r"in '([^']+)", entry) for entry in determine_unique))
df2_examine = df2[cleaned_list]
df2_examine.head(3)
| Amount | TransactionId | |
|---|---|---|
| 0 | 47259 | 8028676 |
| 1 | 48114 | 1488580 |
| 2 | 19018 | 3095155 |
# DEFINE UNIQUE IDENTIFIER COLUMN FOR df1
# TransactionId was 100% unique in the analysis above, so it serves as
# the join key between the two sources.
df1['UniqueIdentifier'] = df1['TransactionId']
# DEFINE UNIQUE IDENTIFIER COLUMN FOR df2
df2['UniqueIdentifier'] = df2['TransactionId']
# Rows whose identifier occurs more than once in either source cannot be
# matched one-to-one, so remove them from both sides.
counts1 = df1['UniqueIdentifier'].value_counts()
counts2 = df2['UniqueIdentifier'].value_counts()
dups = counts1[counts1 > 1].index.tolist() + counts2[counts2 > 1].index.tolist()
df1 = df1[~df1['UniqueIdentifier'].isin(dups)]
df2 = df2[~df2['UniqueIdentifier'].isin(dups)]
df1_set = set(df1['UniqueIdentifier'])
df2_set = set(df2['UniqueIdentifier'])
# Identifiers present in df1 but absent from df2.
in_df1_only = df1_set.difference(df2_set)
print("Of the " + str(len(df1_set)) + " unique identifiers in " + source_data_name1 + " " + str(len(in_df1_only)) + " do not appear in " + source_data_name2)
# FIX: the original wrapped plain string concatenation in a bare
# try/except with a fallback file name; building the name cannot fail,
# so the over-broad handler is removed.
output_file = output_folder_UI + "/Unique identifiers in " + source_data_name1 + " only (" + str(len(in_df1_only)) + " values).txt"
with open(output_file, 'w') as f:
    for item in in_df1_only:
        f.write("%s\n" % item)
# Identifiers present in df2 but absent from df1.
# (FIX: the original comment here said "df1".)
in_df2_only = df2_set.difference(df1_set)
print("Of the " + str(len(df2_set)) + " unique identifiers in " + source_data_name2 + " " + str(len(in_df2_only)) + " do not appear in " + source_data_name1)
# FIX: removed the pointless bare try/except around string concatenation
# (whose fallback name also contained a duplicated "in in" typo).
output_file = output_folder_UI + "/Unique identifiers in " + source_data_name2 + " only (" + str(len(in_df2_only)) + " values).txt"
with open(output_file, 'w') as f:
    for item in in_df2_only:
        f.write("%s\n" % item)
Of the 200 unique identifiers in cloud 3 do not appear in prem Of the 201 unique identifiers in prem 4 do not appear in cloud
# Tag every column with its source name, then pre-build one np.where
# comparison statement per shared column (consumed later via exec against
# df_merged).
df1 = df1.add_suffix('_' + source_data_name1)
df2 = df2.add_suffix('_' + source_data_name2)
comparison_code = [
    (
        "df_merged['" + col + "__IsDifferent'] = "
        "np.where((df_merged['" + col + '_' + source_data_name1 + "'] == "
        "df_merged['" + col + '_' + source_data_name2 + "']), 0, 1)"
    )
    for col in column_names_in_both
]
comparison_code
["df_merged['IsRenewal__IsDifferent'] = np.where((df_merged['IsRenewal_cloud'] == df_merged['IsRenewal_prem']), 0, 1)", "df_merged['PolicyId__IsDifferent'] = np.where((df_merged['PolicyId_cloud'] == df_merged['PolicyId_prem']), 0, 1)", "df_merged['TransactionDate__IsDifferent'] = np.where((df_merged['TransactionDate_cloud'] == df_merged['TransactionDate_prem']), 0, 1)", "df_merged['Amount__IsDifferent'] = np.where((df_merged['Amount_cloud'] == df_merged['Amount_prem']), 0, 1)", "df_merged['TransactionId__IsDifferent'] = np.where((df_merged['TransactionId_cloud'] == df_merged['TransactionId_prem']), 0, 1)", "df_merged['Broker__IsDifferent'] = np.where((df_merged['Broker_cloud'] == df_merged['Broker_prem']), 0, 1)", "df_merged['Currency__IsDifferent'] = np.where((df_merged['Currency_cloud'] == df_merged['Currency_prem']), 0, 1)"]
# Preview df1 with its suffixed column names (e.g. 'TransactionId_cloud').
df1.head(2)
| TransactionId_cloud | Amount_cloud | Currency_cloud | PolicyId_cloud | TransactionDate_cloud | IsRenewal_cloud | Broker_cloud | UniqueIdentifier_cloud | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8028676 | 47259 | gbp | 1621a | 09/02/2023 | 1 | james smith | 8028676 |
| 1 | 1488580 | 48114 | usd | 1611a | 09/02/2023 | 1 | tom jarvis | 1488580 |
# Preview df2 with its suffixed column names (e.g. 'TransactionId_prem').
df2.head(2)
| TransactionId_prem | Amount_prem | Currency_prem | PolicyId_prem | TransactionDate_prem | IsRenewal_prem | Broker_prem | UniqueIdentifier_prem | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8028676 | 47259 | usd | 1621a | 09/02/2023 | 1 | james smith | 8028676 |
| 1 | 1488580 | 48114 | usd | 1611a | 09/02/2023 | 1 | tom jarvis | 1488580 |
# Re-derive a shared 'UniqueIdentifier' join column on each frame (the
# original identifier column was suffixed along with everything else).
UniqueIdentifier_col = next((col for col in df1.columns if 'UniqueIdentifier' in col), None)
df1['UniqueIdentifier'] = df1[UniqueIdentifier_col]
UniqueIdentifier_col = next((col for col in df2.columns if 'UniqueIdentifier' in col), None)
df2['UniqueIdentifier'] = df2[UniqueIdentifier_col]
# Inner join: only identifiers present in both sources are compared.
df_merged = pd.merge(df1, df2, how="inner", on=["UniqueIdentifier"])
df_merged = df_merged.loc[:, ~df_merged.columns.str.startswith('UniqueIdentifier_')]
# FIX: build the per-column difference flags directly instead of exec()ing
# the generated code strings — identical result, no dynamic execution.
for col in column_names_in_both:
    df_merged[col + '__IsDifferent'] = np.where(
        df_merged[col + '_' + source_data_name1] == df_merged[col + '_' + source_data_name2], 0, 1)
df_merged = df_merged.sort_index(axis=1)
df_rows = df_merged.shape[0]  # denominator for the "% of total" discrepancies later
df_merged.head()
| Amount__IsDifferent | Amount_cloud | Amount_prem | Broker__IsDifferent | Broker_cloud | Broker_prem | Currency__IsDifferent | Currency_cloud | Currency_prem | IsRenewal__IsDifferent | IsRenewal_cloud | IsRenewal_prem | PolicyId__IsDifferent | PolicyId_cloud | PolicyId_prem | TransactionDate__IsDifferent | TransactionDate_cloud | TransactionDate_prem | TransactionId__IsDifferent | TransactionId_cloud | TransactionId_prem | UniqueIdentifier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 47259 | 47259 | 0 | james smith | james smith | 1 | gbp | usd | 0 | 1 | 1 | 0 | 1621a | 1621a | 0 | 09/02/2023 | 09/02/2023 | 0 | 8028676 | 8028676 | 8028676 |
| 1 | 0 | 48114 | 48114 | 0 | tom jarvis | tom jarvis | 0 | usd | usd | 0 | 1 | 1 | 0 | 1611a | 1611a | 0 | 09/02/2023 | 09/02/2023 | 0 | 1488580 | 1488580 | 1488580 |
| 2 | 0 | 19018 | 19018 | 0 | martin jones | martin jones | 0 | usd | usd | 0 | 0 | 0 | 0 | 1604a | 1604a | 0 | 10/02/2023 | 10/02/2023 | 0 | 3095155 | 3095155 | 3095155 |
| 3 | 0 | 28290 | 28290 | 0 | martin jones | martin jones | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1607b | 1607x | 0 | 10/02/2023 | 10/02/2023 | 0 | 1263001 | 1263001 | 1263001 |
| 4 | 0 | 34128 | 34128 | 0 | tom jarvis | tom jarvis | 1 | gbp | usd | 0 | 0 | 0 | 0 | 1623a | 1623a | 0 | 09/02/2023 | 09/02/2023 | 0 | 2514622 | 2514622 | 2514622 |
# Per-row discrepancy count across all *__IsDifferent flag columns.
flag_cols = df_merged.filter(regex='IsDifferent')
df_merged['NumberOfDiscrepancies'] = flag_cols.eq(1).sum(axis=1)
# Move the count to the front, then the identifier in front of that, so
# the identifier ends up as the leftmost column.
df_merged.insert(0, 'NumberOfDiscrepancies', df_merged.pop('NumberOfDiscrepancies'))
df_merged.insert(0, 'UniqueIdentifier', df_merged.pop('UniqueIdentifier'))
# Keep only the rows that differ somewhere, worst offenders first.
df_merged_differences_only = df_merged[df_merged['NumberOfDiscrepancies'] > 0]
df_merged_differences_only = df_merged_differences_only.sort_values(by=['NumberOfDiscrepancies'], ascending=False)
df_merged_differences_only.head(3)
| UniqueIdentifier | NumberOfDiscrepancies | Amount__IsDifferent | Amount_cloud | Amount_prem | Broker__IsDifferent | Broker_cloud | Broker_prem | Currency__IsDifferent | Currency_cloud | Currency_prem | IsRenewal__IsDifferent | IsRenewal_cloud | IsRenewal_prem | PolicyId__IsDifferent | PolicyId_cloud | PolicyId_prem | TransactionDate__IsDifferent | TransactionDate_cloud | TransactionDate_prem | TransactionId__IsDifferent | TransactionId_cloud | TransactionId_prem | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 134 | 8227058 | 3 | 0 | 4625 | 4625 | 0 | tom jarvis | tom jarvis | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1625b | 1625x | 1 | 11/02/2023 | 09/02/2023 | 0 | 8227058 | 8227058 |
| 90 | 2754026 | 3 | 0 | 28041 | 28041 | 1 | martin jones | james smith | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1622b | 1622x | 0 | 10/02/2023 | 10/02/2023 | 0 | 2754026 | 2754026 |
| 91 | 6119692 | 3 | 0 | 24656 | 24656 | 1 | tom jarvis | james smith | 1 | gbp | usd | 0 | 0 | 0 | 1 | 1604b | 1604x | 0 | 11/02/2023 | 11/02/2023 | 0 | 6119692 | 6119692 |
# Report how many matched rows contain at least one discrepancy.
print(f"df has {df_merged_differences_only.shape[0]} rows (this many records contain at least 1 difference) and {df_merged_differences_only.shape[1]} columns")
df has 127 rows (this many records contain at least 1 difference) and 23 columns
# Drop any stray index columns picked up from earlier CSV round-trips.
df_merged_differences_only = df_merged_differences_only.loc[:, ~df_merged_differences_only.columns.str.startswith('Unnamed: ')]
# FIX: catch only NameError (the config flag may be undefined on a partial
# notebook re-run) instead of the original bare except, which hid real
# failures.
try:
    if replace_null_like_values_with_0:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " differences (null fixed) - " + str_current_datetime + ".csv"
    else:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " differences (null unfixed) - " + str_current_datetime + ".csv"
except NameError:
    output_file = output_folder + "/Data differences.csv"
df_merged_differences_only.to_csv(output_file, index=False)
# Build the per-column summary: how many difference-rows flag each column.
df_summary = df_merged_differences_only.loc[:, df_merged_differences_only.columns.str.endswith('__IsDifferent')]
# FIX: the original wrapped the sums in a DataFrame, transposed, then
# transposed back (`pd.DataFrame(...).T` followed by `.T`) — a no-op pair.
# Wrapping the column sums directly gives the same (n_columns x 1) frame.
df_summary = pd.DataFrame(df_summary.sum())
# Promote the column names into a 'Column' column and name the count.
df_summary = df_summary.reset_index()
df_summary.columns.values[0] = "Column"
df_summary.columns.values[1] = "Number of Discrepancies"
df_summary['Column'] = df_summary['Column'].str.replace('__IsDifferent', '')
# Percentage of ALL compared rows that show a discrepancy in each column.
df_summary['Percent Of Total'] = round(df_summary['Number of Discrepancies'] / df_rows * 100, 2)
df_summary = df_summary.sort_values(by=['Number of Discrepancies'], ascending=False)
# FIX: narrow the bare except to NameError, as for the differences CSV.
try:
    if replace_null_like_values_with_0:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " summary (null fixed) - " + str_current_datetime + ".csv"
    else:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " summary (null unfixed) - " + str_current_datetime + ".csv"
except NameError:
    output_file = output_folder + "/Data summary.csv"
df_summary.to_csv(output_file, index=False)
df_summary
| Column | Number of Discrepancies | Percent Of Total | |
|---|---|---|---|
| 4 | PolicyId | 79 | 40.10 |
| 2 | Currency | 73 | 37.06 |
| 5 | TransactionDate | 12 | 6.09 |
| 1 | Broker | 9 | 4.57 |
| 3 | IsRenewal | 3 | 1.52 |
| 0 | Amount | 0 | 0.00 |
| 6 | TransactionId | 0 | 0.00 |
# Bar chart: percentage of compared rows that differ, per column.
df_summary_size = df_summary.shape[0]
plot = df_summary.plot.bar(
    x='Column',
    y='Percent Of Total',
    title='Percentage of values that differ by column ({0} rows compared)'.format(df_rows),
    xlabel="Column name (as appears in both data sources)",
    ylabel="% difference",
    figsize=(13, 5),
    rot=90,
)
# Extra headroom above 100% so the rotated labels are not clipped.
plot.set_ylim(0, 119.75)
# Annotate each bar: green for a clean column, red where values differ.
for idx, pct in enumerate(df_summary['Percent Of Total']):
    label = '{0:.2f}%'.format(pct)
    color = 'green' if float(label.strip('%')) == 0.00 else 'red'
    plot.text(idx, pct + 2, label, ha='center', rotation=90, color=color)
plot.figure.savefig(output_folder + "/Summary image - " + str_current_datetime + ".png", bbox_inches='tight')
# Excel VBA macro (exported to a .txt file and copied to the clipboard in
# the final cells). When run inside Excel against the differences CSV, it
# adds conditional formatting that colours every cell equal to 1 red in
# each *__IsDifferent column.
vba_to_run = r'''
Sub ApplyConditionalFormatting()
Dim cell As Range
Dim lastColumn As Long
lastColumn = Cells(1, Columns.Count).End(xlToLeft).Column
For Each cell In Range("A1").Resize(1, lastColumn)
If InStr(cell.Value2, "__IsDifferent") > 0 Then
With Range(cell.Offset(1), Cells(Rows.Count, cell.Column).End(xlUp))
.FormatConditions.Add Type:=xlCellValue, Operator:=xlEqual, Formula1:="=1"
.FormatConditions(.FormatConditions.Count).SetFirstPriority
With .FormatConditions(1).Interior
.PatternColorIndex = xlAutomatic
.Color = 255
.TintAndShade = 0
End With
End With
End If
Next cell
End Sub
'''
# Record the source SQL alongside the outputs.
# FIX: catch NameError (sql1/sql2 undefined if the load cells were
# skipped) and OSError instead of the original bare except; this keeps
# the best-effort intent without swallowing every exception type.
try:
    sql_file = output_folder + "/SQL for data.txt"
    with open(sql_file, "w") as f:
        f.write(sql1 + "\n")
        f.write(sql2)
except (NameError, OSError):
    pass
# Export the highlighting macro next to the CSVs it applies to.
vba_file = output_folder + "/VBA to highlight differences.txt"
with open(vba_file, "w") as f:
    f.write("'Run the below VBA to highlight cells in the comparison.csv....\n\n" + vba_to_run)
# Archive a copy of this notebook with the outputs for reproducibility.
shutil.copyfile('Data Comparison.ipynb', output_folder + '/Analysis file.ipynb')
'Outputs/cloud vs prem (2024-02-10)/Analysis file.ipynb'
# Notify the user (Windows-only win32 MessageBox) and put the macro on
# the clipboard.
# FIX: corrected typos in the user-facing message ("marks" -> "mark",
# "discrepenices" -> "discrepancies").
MessageBox(0, "Some VBA code is now attached to your clipboard (which can be pasted). This code will also be exported to a .txt file. Running this code in the differences.csv spreadsheet will colour cells that mark differences (discrepancies) red.", "Complete", 0x00001000)
print(vba_to_run)
pyperclip.copy(vba_to_run)
Sub ApplyConditionalFormatting()
Dim cell As Range
Dim lastColumn As Long
lastColumn = Cells(1, Columns.Count).End(xlToLeft).Column
For Each cell In Range("A1").Resize(1, lastColumn)
If InStr(cell.Value2, "__IsDifferent") > 0 Then
With Range(cell.Offset(1), Cells(Rows.Count, cell.Column).End(xlUp))
.FormatConditions.Add Type:=xlCellValue, Operator:=xlEqual, Formula1:="=1"
.FormatConditions(.FormatConditions.Count).SetFirstPriority
With .FormatConditions(1).Interior
.PatternColorIndex = xlAutomatic
.Color = 255
.TintAndShade = 0
End With
End With
End If
Next cell
End Sub
# List what the run produced, split into files and folders.
# FIX: classify entries with os.path.isfile/isdir instead of the original
# "'.' in name" heuristic, which misfiles dotted folder names and misses
# extensionless files.
output_folder_contents = os.listdir(output_folder)
print("These FILES were created in the '" + output_folder + "' folder...\n")
for entry in output_folder_contents:
    if os.path.isfile(os.path.join(output_folder, entry)):
        print(entry)
print("\n\nThese FOLDERS were created in the '" + output_folder + "' folder...\n")
for entry in output_folder_contents:
    if os.path.isdir(os.path.join(output_folder, entry)):
        print(entry)
These FILES were created in the 'Outputs/cloud vs prem (2024-02-10)' folder... Analysis file.ipynb cloud vs prem differences (null fixed) - 2024-02-10.csv cloud vs prem summary (null fixed) - 2024-02-10.csv Row count cloud vs prem.txt SQL for data.txt Summary image - 2024-02-10.png VBA to highlight differences.txt These FOLDERS were created in the 'Outputs/cloud vs prem (2024-02-10)' folder... Column name comparison Unique identifiers missing in each