This Python notebook compares two database tables and reports the differences between them.
The outputs (listed after the print statement in the final cell) include CSVs, text files, and a summary image.
# --- Comparison configuration ---
# Friendly labels for the two data sources; used in printed messages,
# output folder names and file names.
source_data_name1 = 'cloud'
source_data_name2 = 'prem'
# When True, null-like strings ('NaT', 'None', 'nan', 'NULL', '') are
# normalised to '0' before comparing, so differing null representations
# are not reported as discrepancies.
replace_null_like_values_with_0 = True
# When True, values are lower-cased, trimmed and stripped of hyphens and
# apostrophes before comparing.
# NOTE(review): the name is misspelled ("normlaise") but is referenced
# later in the notebook, so it is left unchanged here.
normlaise_case_trim_remove_special_chars = True
# When True, the columns listed in columns_to_delete (later cell) are
# dropped from both sources before comparison.
remove_named_columns = False
import pandas as pd
import numpy as np
import collections
import re
import pyperclip
import itertools
import os
from win32api import MessageBox
from datetime import datetime
import os
import shutil
import sqlite3
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
%autosave 0
Autosave disabled
# Build a dated output folder tree, replacing any previous output for today.
str_current_datetime = str(datetime.now().strftime("%Y-%m-%d"))
output_folder = "Outputs/" + source_data_name1 + " vs " + source_data_name2 + " (" + str_current_datetime + ")"
# Remove output from an earlier run today; a missing folder is fine.
# FIX: catch only FileNotFoundError instead of the original bare except,
# which would also hide e.g. permission errors.
try:
    shutil.rmtree(output_folder)
except FileNotFoundError:
    pass
os.makedirs(output_folder)
# Sub-folder for the column-name comparison text files.
output_folder_column_names = output_folder + "/Column name comparison"
os.makedirs(output_folder_column_names)
# Sub-folder for the lists of unique identifiers missing from each source.
output_folder_UI = output_folder + "/Unique identifiers missing in each"
os.makedirs(output_folder_UI)
# Load the first source table from the SQLite input database; everything
# is cast to str so later comparisons are purely textual.
conn = sqlite3.connect('Inputs/database.db')
sql1 = "SELECT * FROM trans_c"
df1 = pd.read_sql_query(sql1, conn).astype(str)
df1_records_with_dups = len(df1)
print(f"{source_data_name1} data has {df1_records_with_dups} records and {df1.shape[1]} columns")
cloud data has 200 records and 7 columns
# Load the second source table, then release the database connection.
sql2 = "SELECT * FROM trans_p"
df2 = pd.read_sql_query(sql2, conn)
df2 = df2.astype(str)
df2_records_with_dups = df2.shape[0]
print(source_data_name2 + " data has " + str(df2_records_with_dups) + " records and " + str(df2.shape[1]) + " columns")
# BUG FIX: the original said `conn.close` (no parentheses), which only
# references the bound method and leaves the connection open.
conn.close()
<function Connection.close>
# Write the raw (pre-filtering) row-count comparison to a text file.
row_count_message = "The row count differs by " + str(abs(df1_records_with_dups - df2_records_with_dups))
row_count = output_folder + "/Row count " + source_data_name1 + " vs " + source_data_name2 + ".txt"
with open(row_count, "w") as f:
    f.write(row_count_message + " (before any processing/filtering)")
    f.write("\n" + source_data_name1 + " has " + str(df1_records_with_dups) + " rows\n")
    # BUG FIX: original wrote "row" (singular) for the second source.
    f.write(source_data_name2 + " has " + str(df2_records_with_dups) + " rows")
print(row_count_message)
The row count differs by 1
# Optionally exclude named columns from both sources before comparing.
if remove_named_columns:
    columns_to_delete = ['Broker']
    # Index.intersection keeps only the names actually present, so a
    # missing column does not raise.
    df1 = df1.drop(columns=df1.columns.intersection(columns_to_delete))
    df2 = df2.drop(columns=df2.columns.intersection(columns_to_delete))
# Compare the column names of the two sources.
df1_columns = set(df1.columns)
df2_columns = set(df2.columns)
df1_columns_dif = df1_columns.difference(df2_columns)
df2_columns_dif = df2_columns.difference(df1_columns)
# Only columns present in BOTH sources can be value-compared later.
# (Computed unconditionally: the original only defined this on one branch,
# and later cells depend on it.)
column_names_in_both = df2_columns.intersection(df1_columns)
# BUG FIX: the original tested `len(df1_columns) + len(df2_columns) == 0`
# (the total column counts), so it claimed "same column names" even when
# the sources differed. Test the symmetric differences instead.
if df1_columns_dif or df2_columns_dif:
    print("These columns names appear in " + source_data_name1 + " only...")
    print(df1_columns_dif)
    print("\nThese columns names appear in " + source_data_name2 + " only...")
    print(df2_columns_dif)
else:
    print("The data sets have the same column names.")
print("\nThese columns names appear in both (and will be compared)...")
print(column_names_in_both)
The data sets have the same column names.
These columns names appear in both (and will be compared)...
{'IsRenewal', 'PolicyId', 'TransactionDate', 'Amount', 'TransactionId', 'Broker', 'Currency'}
def _write_name_list(path_prefix, names):
    """Write one name per line to `<path_prefix>.txt`.

    Appends ' (empty)' to the file name when there are no names, so an
    empty result is visible from the folder listing alone.
    """
    suffix = ' (empty)' if len(names) == 0 else ''
    with open(path_prefix + suffix + '.txt', 'w') as f:
        for name in names:
            f.write("%s\n" % name)

# One file per category (the original repeated this logic three times):
# names unique to each source, then the shared names.
_write_name_list(output_folder_column_names + "/Columns in df1 only", df1_columns_dif)
_write_name_list(output_folder_column_names + "/Columns in df2 only", df2_columns_dif)
_write_name_list(output_folder_column_names + "/Columns in " + source_data_name1 + " AND " + source_data_name2, column_names_in_both)
# Drop exact-duplicate rows from the first source and report the effect.
df1 = df1.drop_duplicates()
df1_records_no_dups = len(df1)
d1_dups_removed = str(df1_records_with_dups - df1_records_no_dups)
print(f"Dataframe from {source_data_name1} now has {df1_records_no_dups} records ({d1_dups_removed} were removed) and {df1.shape[1]} columns")
Dataframe from cloud now has 200 records (0 were removed) and 7 columns
# Drop exact-duplicate rows from the second source and report the effect.
df2 = df2.drop_duplicates()
df2_records_no_dups = len(df2)
d2_dups_removed = str(df2_records_with_dups - df2_records_no_dups)
print(f"Dataframe from {source_data_name2} now has {df2_records_no_dups} records ({d2_dups_removed} were removed) and {df2.shape[1]} columns")
Dataframe from prem now has 201 records (0 were removed) and 7 columns
# Strip trailing ".0" artefacts left by numeric columns cast to str, so
# e.g. "47259.0" and "47259" compare as equal.
# FIX: use raw strings for the regex patterns; '\.' inside a plain string
# is an invalid escape sequence (DeprecationWarning, future SyntaxError).
df1 = df1.replace(r'\.0+$', '', regex=True)
df2 = df2.replace(r'\.0+$', '', regex=True)
if replace_null_like_values_with_0:
    # Normalise every null-like representation to the single token '0'.
    df1 = df1.replace(['NaT', 'None', '0', 'nan', 'NULL', ''], '0')
    df2 = df2.replace(['NaT', 'None', '0', 'nan', 'NULL', ''], '0')
if normlaise_case_trim_remove_special_chars:
    # Lower-case, trim, and remove hyphens/apostrophes so cosmetic
    # differences are not reported as discrepancies.
    df1 = df1.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
    df1 = df1.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
    df1 = df1.replace(re.compile(r"\-|'"), '', regex=True)
    df2 = df2.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
    df2 = df2.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
    df2 = df2.replace(re.compile(r"\-|'"), '', regex=True)
# Identify near-unique columns in df1 — candidates for a unique identifier.
determine_unique = []
for col in df1.columns:
    total = len(df1[col])
    num_unique = len(df1[col].unique())
    # Guard against an empty frame (the original divided unconditionally).
    pct_unique = (num_unique / total) * 100 if total else 0.0
    # FIX: keep only columns that are at least 90% unique by comparing the
    # number directly; the original appended every column and then grepped
    # the formatted strings with r'100\.|9[0-9]\.', which was fragile.
    if pct_unique >= 90:
        determine_unique.append(f"{pct_unique:.1f}% of values in '{col}' are unique ({num_unique} unique values out of {total})")
print("These columns have at least 90% unique values...")
determine_unique.sort()
determine_unique
These columns have at least 90% unique values...
["100.0% of values in 'Amount' are unique (200 unique values out of 200)", "100.0% of values in 'TransactionId' are unique (200 unique values out of 200)"]
# Pull the column names back out of the summary strings, then preview the
# candidate identifier columns in df1.
cleaned_list = list(itertools.chain.from_iterable(
    re.findall(r"in '([^']+)", entry) for entry in determine_unique))
df1_examine = df1[cleaned_list]
df1_examine.head(3)
| Amount | TransactionId | |
|---|---|---|
| 0 | 47259 | 8028676 |
| 1 | 48114 | 1488580 |
| 2 | 19018 | 3095155 |
# Identify near-unique columns in df2 (same analysis as for df1 above).
determine_unique = []
for col in df2.columns:
    total = len(df2[col])
    num_unique = len(df2[col].unique())
    # Guard against an empty frame (the original divided unconditionally).
    pct_unique = (num_unique / total) * 100 if total else 0.0
    # FIX: numeric >= 90 filter instead of the original regex over the
    # formatted string (r'100\.|9[0-9]\.'), which was fragile.
    if pct_unique >= 90:
        determine_unique.append(f"{pct_unique:.1f}% of values in '{col}' are unique ({num_unique} unique values out of {total})")
print("These columns have at least 90% unique values...")
determine_unique.sort()
determine_unique
These columns have at least 90% unique values...
["100.0% of values in 'Amount' are unique (201 unique values out of 201)", "100.0% of values in 'TransactionId' are unique (201 unique values out of 201)"]
# Pull the column names back out of the summary strings, then preview the
# candidate identifier columns in df2.
cleaned_list = list(itertools.chain.from_iterable(
    re.findall(r"in '([^']+)", entry) for entry in determine_unique))
df2_examine = df2[cleaned_list]
df2_examine.head(3)
| Amount | TransactionId | |
|---|---|---|
| 0 | 47259 | 8028676 |
| 1 | 48114 | 1488580 |
| 2 | 19018 | 3095155 |
# DEFINE UNIQUE IDENTIFIER COLUMN FOR df1
# TransactionId was 100% unique in the analysis above, so it serves as
# the join key between the two sources.
df1['UniqueIdentifier'] = df1['TransactionId']
# DEFINE UNIQUE IDENTIFIER COLUMN FOR df2
df2['UniqueIdentifier'] = df2['TransactionId']
# Rows whose identifier occurs more than once in either source cannot be
# matched one-to-one, so remove them from both sides.
counts1 = df1['UniqueIdentifier'].value_counts()
counts2 = df2['UniqueIdentifier'].value_counts()
dups = counts1[counts1 > 1].index.tolist() + counts2[counts2 > 1].index.tolist()
df1 = df1[~df1['UniqueIdentifier'].isin(dups)]
df2 = df2[~df2['UniqueIdentifier'].isin(dups)]
df1_set = set(df1['UniqueIdentifier'])
df2_set = set(df2['UniqueIdentifier'])
# Identifiers present in df1 but absent from df2.
in_df1_only = df1_set.difference(df2_set)
print("Of the " + str(len(df1_set)) + " unique identifiers in " + source_data_name1 + " " + str(len(in_df1_only)) + " do not appear in " + source_data_name2)
# FIX: the original wrapped plain string concatenation in a bare
# try/except with a fallback file name; building the name cannot fail,
# so the over-broad handler is removed.
output_file = output_folder_UI + "/Unique identifiers in " + source_data_name1 + " only (" + str(len(in_df1_only)) + " values).txt"
with open(output_file, 'w') as f:
    for item in in_df1_only:
        f.write("%s\n" % item)
# Identifiers present in df2 but absent from df1.
# (FIX: the original comment here said "df1".)
in_df2_only = df2_set.difference(df1_set)
print("Of the " + str(len(df2_set)) + " unique identifiers in " + source_data_name2 + " " + str(len(in_df2_only)) + " do not appear in " + source_data_name1)
# FIX: removed the pointless bare try/except around string concatenation
# (whose fallback name also contained a duplicated "in in" typo).
output_file = output_folder_UI + "/Unique identifiers in " + source_data_name2 + " only (" + str(len(in_df2_only)) + " values).txt"
with open(output_file, 'w') as f:
    for item in in_df2_only:
        f.write("%s\n" % item)
Of the 200 unique identifiers in cloud 3 do not appear in prem Of the 201 unique identifiers in prem 4 do not appear in cloud
# Tag every column with its source name, then pre-build one np.where
# comparison statement per shared column (consumed later via exec against
# df_merged).
df1 = df1.add_suffix('_' + source_data_name1)
df2 = df2.add_suffix('_' + source_data_name2)
comparison_code = [
    (
        "df_merged['" + col + "__IsDifferent'] = "
        "np.where((df_merged['" + col + '_' + source_data_name1 + "'] == "
        "df_merged['" + col + '_' + source_data_name2 + "']), 0, 1)"
    )
    for col in column_names_in_both
]
comparison_code
["df_merged['IsRenewal__IsDifferent'] = np.where((df_merged['IsRenewal_cloud'] == df_merged['IsRenewal_prem']), 0, 1)", "df_merged['PolicyId__IsDifferent'] = np.where((df_merged['PolicyId_cloud'] == df_merged['PolicyId_prem']), 0, 1)", "df_merged['TransactionDate__IsDifferent'] = np.where((df_merged['TransactionDate_cloud'] == df_merged['TransactionDate_prem']), 0, 1)", "df_merged['Amount__IsDifferent'] = np.where((df_merged['Amount_cloud'] == df_merged['Amount_prem']), 0, 1)", "df_merged['TransactionId__IsDifferent'] = np.where((df_merged['TransactionId_cloud'] == df_merged['TransactionId_prem']), 0, 1)", "df_merged['Broker__IsDifferent'] = np.where((df_merged['Broker_cloud'] == df_merged['Broker_prem']), 0, 1)", "df_merged['Currency__IsDifferent'] = np.where((df_merged['Currency_cloud'] == df_merged['Currency_prem']), 0, 1)"]
# Preview df1 with its suffixed column names (e.g. 'TransactionId_cloud').
df1.head(2)
| TransactionId_cloud | Amount_cloud | Currency_cloud | PolicyId_cloud | TransactionDate_cloud | IsRenewal_cloud | Broker_cloud | UniqueIdentifier_cloud | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8028676 | 47259 | gbp | 1621a | 09/02/2023 | 1 | james smith | 8028676 |
| 1 | 1488580 | 48114 | usd | 1611a | 09/02/2023 | 1 | tom jarvis | 1488580 |
# Preview df2 with its suffixed column names (e.g. 'TransactionId_prem').
df2.head(2)
| TransactionId_prem | Amount_prem | Currency_prem | PolicyId_prem | TransactionDate_prem | IsRenewal_prem | Broker_prem | UniqueIdentifier_prem | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8028676 | 47259 | usd | 1621a | 09/02/2023 | 1 | james smith | 8028676 |
| 1 | 1488580 | 48114 | usd | 1611a | 09/02/2023 | 1 | tom jarvis | 1488580 |
# Re-derive a shared 'UniqueIdentifier' join column on each frame (the
# original identifier column was suffixed along with everything else).
UniqueIdentifier_col = next((col for col in df1.columns if 'UniqueIdentifier' in col), None)
df1['UniqueIdentifier'] = df1[UniqueIdentifier_col]
UniqueIdentifier_col = next((col for col in df2.columns if 'UniqueIdentifier' in col), None)
df2['UniqueIdentifier'] = df2[UniqueIdentifier_col]
# Inner join: only identifiers present in both sources are compared.
df_merged = pd.merge(df1, df2, how="inner", on=["UniqueIdentifier"])
df_merged = df_merged.loc[:, ~df_merged.columns.str.startswith('UniqueIdentifier_')]
# FIX: build the per-column difference flags directly instead of exec()ing
# the generated code strings — identical result, no dynamic execution.
for col in column_names_in_both:
    df_merged[col + '__IsDifferent'] = np.where(
        df_merged[col + '_' + source_data_name1] == df_merged[col + '_' + source_data_name2], 0, 1)
df_merged = df_merged.sort_index(axis=1)
df_rows = df_merged.shape[0]  # denominator for the "% of total" discrepancies later
df_merged.head()
| Amount__IsDifferent | Amount_cloud | Amount_prem | Broker__IsDifferent | Broker_cloud | Broker_prem | Currency__IsDifferent | Currency_cloud | Currency_prem | IsRenewal__IsDifferent | IsRenewal_cloud | IsRenewal_prem | PolicyId__IsDifferent | PolicyId_cloud | PolicyId_prem | TransactionDate__IsDifferent | TransactionDate_cloud | TransactionDate_prem | TransactionId__IsDifferent | TransactionId_cloud | TransactionId_prem | UniqueIdentifier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 47259 | 47259 | 0 | james smith | james smith | 1 | gbp | usd | 0 | 1 | 1 | 0 | 1621a | 1621a | 0 | 09/02/2023 | 09/02/2023 | 0 | 8028676 | 8028676 | 8028676 |
| 1 | 0 | 48114 | 48114 | 0 | tom jarvis | tom jarvis | 0 | usd | usd | 0 | 1 | 1 | 0 | 1611a | 1611a | 0 | 09/02/2023 | 09/02/2023 | 0 | 1488580 | 1488580 | 1488580 |
| 2 | 0 | 19018 | 19018 | 0 | martin jones | martin jones | 0 | usd | usd | 0 | 0 | 0 | 0 | 1604a | 1604a | 0 | 10/02/2023 | 10/02/2023 | 0 | 3095155 | 3095155 | 3095155 |
| 3 | 0 | 28290 | 28290 | 0 | martin jones | martin jones | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1607b | 1607x | 0 | 10/02/2023 | 10/02/2023 | 0 | 1263001 | 1263001 | 1263001 |
| 4 | 0 | 34128 | 34128 | 0 | tom jarvis | tom jarvis | 1 | gbp | usd | 0 | 0 | 0 | 0 | 1623a | 1623a | 0 | 09/02/2023 | 09/02/2023 | 0 | 2514622 | 2514622 | 2514622 |
# Per-row discrepancy count across all *__IsDifferent flag columns.
flag_cols = df_merged.filter(regex='IsDifferent')
df_merged['NumberOfDiscrepancies'] = flag_cols.eq(1).sum(axis=1)
# Move the count to the front, then the identifier in front of that, so
# the identifier ends up as the leftmost column.
df_merged.insert(0, 'NumberOfDiscrepancies', df_merged.pop('NumberOfDiscrepancies'))
df_merged.insert(0, 'UniqueIdentifier', df_merged.pop('UniqueIdentifier'))
# Keep only the rows that differ somewhere, worst offenders first.
df_merged_differences_only = df_merged[df_merged['NumberOfDiscrepancies'] > 0]
df_merged_differences_only = df_merged_differences_only.sort_values(by=['NumberOfDiscrepancies'], ascending=False)
df_merged_differences_only.head(3)
| UniqueIdentifier | NumberOfDiscrepancies | Amount__IsDifferent | Amount_cloud | Amount_prem | Broker__IsDifferent | Broker_cloud | Broker_prem | Currency__IsDifferent | Currency_cloud | Currency_prem | IsRenewal__IsDifferent | IsRenewal_cloud | IsRenewal_prem | PolicyId__IsDifferent | PolicyId_cloud | PolicyId_prem | TransactionDate__IsDifferent | TransactionDate_cloud | TransactionDate_prem | TransactionId__IsDifferent | TransactionId_cloud | TransactionId_prem | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 134 | 8227058 | 3 | 0 | 4625 | 4625 | 0 | tom jarvis | tom jarvis | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1625b | 1625x | 1 | 11/02/2023 | 09/02/2023 | 0 | 8227058 | 8227058 |
| 90 | 2754026 | 3 | 0 | 28041 | 28041 | 1 | martin jones | james smith | 1 | gbp | usd | 0 | 1 | 1 | 1 | 1622b | 1622x | 0 | 10/02/2023 | 10/02/2023 | 0 | 2754026 | 2754026 |
| 91 | 6119692 | 3 | 0 | 24656 | 24656 | 1 | tom jarvis | james smith | 1 | gbp | usd | 0 | 0 | 0 | 1 | 1604b | 1604x | 0 | 11/02/2023 | 11/02/2023 | 0 | 6119692 | 6119692 |
# Report how many matched rows contain at least one discrepancy.
print(f"df has {df_merged_differences_only.shape[0]} rows (this many records contain at least 1 difference) and {df_merged_differences_only.shape[1]} columns")
df has 127 rows (this many records contain at least 1 difference) and 23 columns
# Drop any stray index columns picked up from earlier CSV round-trips.
df_merged_differences_only = df_merged_differences_only.loc[:, ~df_merged_differences_only.columns.str.startswith('Unnamed: ')]
# FIX: catch only NameError (the config flag may be undefined on a partial
# notebook re-run) instead of the original bare except, which hid real
# failures.
try:
    if replace_null_like_values_with_0:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " differences (null fixed) - " + str_current_datetime + ".csv"
    else:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " differences (null unfixed) - " + str_current_datetime + ".csv"
except NameError:
    output_file = output_folder + "/Data differences.csv"
df_merged_differences_only.to_csv(output_file, index=False)
# Build the per-column summary: how many difference-rows flag each column.
df_summary = df_merged_differences_only.loc[:, df_merged_differences_only.columns.str.endswith('__IsDifferent')]
# FIX: the original wrapped the sums in a DataFrame, transposed, then
# transposed back (`pd.DataFrame(...).T` followed by `.T`) — a no-op pair.
# Wrapping the column sums directly gives the same (n_columns x 1) frame.
df_summary = pd.DataFrame(df_summary.sum())
# Promote the column names into a 'Column' column and name the count.
df_summary = df_summary.reset_index()
df_summary.columns.values[0] = "Column"
df_summary.columns.values[1] = "Number of Discrepancies"
df_summary['Column'] = df_summary['Column'].str.replace('__IsDifferent', '')
# Percentage of ALL compared rows that show a discrepancy in each column.
df_summary['Percent Of Total'] = round(df_summary['Number of Discrepancies'] / df_rows * 100, 2)
df_summary = df_summary.sort_values(by=['Number of Discrepancies'], ascending=False)
# FIX: narrow the bare except to NameError, as for the differences CSV.
try:
    if replace_null_like_values_with_0:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " summary (null fixed) - " + str_current_datetime + ".csv"
    else:
        output_file = output_folder + "/" + source_data_name1 + " vs " + source_data_name2 + " summary (null unfixed) - " + str_current_datetime + ".csv"
except NameError:
    output_file = output_folder + "/Data summary.csv"
df_summary.to_csv(output_file, index=False)
df_summary
| Column | Number of Discrepancies | Percent Of Total | |
|---|---|---|---|
| 4 | PolicyId | 79 | 40.10 |
| 2 | Currency | 73 | 37.06 |
| 5 | TransactionDate | 12 | 6.09 |
| 1 | Broker | 9 | 4.57 |
| 3 | IsRenewal | 3 | 1.52 |
| 0 | Amount | 0 | 0.00 |
| 6 | TransactionId | 0 | 0.00 |
# Bar chart: percentage of compared rows that differ, per column.
df_summary_size = df_summary.shape[0]
plot = df_summary.plot.bar(
    x='Column',
    y='Percent Of Total',
    title='Percentage of values that differ by column ({0} rows compared)'.format(df_rows),
    xlabel="Column name (as appears in both data sources)",
    ylabel="% difference",
    figsize=(13, 5),
    rot=90,
)
# Extra headroom above 100% so the rotated labels are not clipped.
plot.set_ylim(0, 119.75)
# Annotate each bar: green for a clean column, red where values differ.
for idx, pct in enumerate(df_summary['Percent Of Total']):
    label = '{0:.2f}%'.format(pct)
    color = 'green' if float(label.strip('%')) == 0.00 else 'red'
    plot.text(idx, pct + 2, label, ha='center', rotation=90, color=color)
plot.figure.savefig(output_folder + "/Summary image - " + str_current_datetime + ".png", bbox_inches='tight')
# Excel VBA macro (exported to a .txt file and copied to the clipboard in
# the final cells). When run inside Excel against the differences CSV, it
# adds conditional formatting that colours every cell equal to 1 red in
# each *__IsDifferent column.
vba_to_run = r'''
Sub ApplyConditionalFormatting()
Dim cell As Range
Dim lastColumn As Long
lastColumn = Cells(1, Columns.Count).End(xlToLeft).Column
For Each cell In Range("A1").Resize(1, lastColumn)
If InStr(cell.Value2, "__IsDifferent") > 0 Then
With Range(cell.Offset(1), Cells(Rows.Count, cell.Column).End(xlUp))
.FormatConditions.Add Type:=xlCellValue, Operator:=xlEqual, Formula1:="=1"
.FormatConditions(.FormatConditions.Count).SetFirstPriority
With .FormatConditions(1).Interior
.PatternColorIndex = xlAutomatic
.Color = 255
.TintAndShade = 0
End With
End With
End If
Next cell
End Sub
'''
# Record the source SQL alongside the outputs.
# FIX: catch NameError (sql1/sql2 undefined if the load cells were
# skipped) and OSError instead of the original bare except; this keeps
# the best-effort intent without swallowing every exception type.
try:
    sql_file = output_folder + "/SQL for data.txt"
    with open(sql_file, "w") as f:
        f.write(sql1 + "\n")
        f.write(sql2)
except (NameError, OSError):
    pass
# Export the highlighting macro next to the CSVs it applies to.
vba_file = output_folder + "/VBA to highlight differences.txt"
with open(vba_file, "w") as f:
    f.write("'Run the below VBA to highlight cells in the comparison.csv....\n\n" + vba_to_run)
# Archive a copy of this notebook with the outputs for reproducibility.
shutil.copyfile('Data Comparison.ipynb', output_folder + '/Analysis file.ipynb')
'Outputs/cloud vs prem (2024-02-10)/Analysis file.ipynb'
# Notify the user (Windows-only win32 MessageBox) and put the macro on
# the clipboard.
# FIX: corrected typos in the user-facing message ("marks" -> "mark",
# "discrepenices" -> "discrepancies").
MessageBox(0, "Some VBA code is now attached to your clipboard (which can be pasted). This code will also be exported to a .txt file. Running this code in the differences.csv spreadsheet will colour cells that mark differences (discrepancies) red.", "Complete", 0x00001000)
print(vba_to_run)
pyperclip.copy(vba_to_run)
Sub ApplyConditionalFormatting()
Dim cell As Range
Dim lastColumn As Long
lastColumn = Cells(1, Columns.Count).End(xlToLeft).Column
For Each cell In Range("A1").Resize(1, lastColumn)
If InStr(cell.Value2, "__IsDifferent") > 0 Then
With Range(cell.Offset(1), Cells(Rows.Count, cell.Column).End(xlUp))
.FormatConditions.Add Type:=xlCellValue, Operator:=xlEqual, Formula1:="=1"
.FormatConditions(.FormatConditions.Count).SetFirstPriority
With .FormatConditions(1).Interior
.PatternColorIndex = xlAutomatic
.Color = 255
.TintAndShade = 0
End With
End With
End If
Next cell
End Sub
# List what the run produced, split into files and folders.
# FIX: classify entries with os.path.isfile/isdir instead of the original
# "'.' in name" heuristic, which misfiles dotted folder names and misses
# extensionless files.
output_folder_contents = os.listdir(output_folder)
print("These FILES were created in the '" + output_folder + "' folder...\n")
for entry in output_folder_contents:
    if os.path.isfile(os.path.join(output_folder, entry)):
        print(entry)
print("\n\nThese FOLDERS were created in the '" + output_folder + "' folder...\n")
for entry in output_folder_contents:
    if os.path.isdir(os.path.join(output_folder, entry)):
        print(entry)
These FILES were created in the 'Outputs/cloud vs prem (2024-02-10)' folder... Analysis file.ipynb cloud vs prem differences (null fixed) - 2024-02-10.csv cloud vs prem summary (null fixed) - 2024-02-10.csv Row count cloud vs prem.txt SQL for data.txt Summary image - 2024-02-10.png VBA to highlight differences.txt These FOLDERS were created in the 'Outputs/cloud vs prem (2024-02-10)' folder... Column name comparison Unique identifiers missing in each