import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Synthetic data: 200 employees whose 'salary' and 'age' values are normally
# distributed around the means below (standard deviation = mean / 4).
np.random.seed(41)
samples = 200
mean_salary = 70011
mean_age = 41
# Salaries are drawn before ages so the seeded random stream is consumed in a fixed order.
salaries = np.random.normal(mean_salary, mean_salary / 4, samples).astype(int)
ages = np.random.normal(mean_age, mean_age / 4, samples).astype(int)

# Plant two large outliers at known positions
index_1 = 2
index_2 = 3
salaries[index_1] = 7 * mean_salary  # third row: salary outlier only
salaries[index_2] = 6 * mean_salary  # fourth row: salary outlier ...
ages[index_2] = 101                  # ... combined with an age outlier

# Assemble the dataframe; employees are named Employee_1 .. Employee_<samples>
df = pd.DataFrame(dict(
    name=[f'Employee_{number}' for number in range(1, samples + 1)],
    salary=salaries,
    age=ages,
))

# Show the first five rows, shading the two planted outlier rows in red
outlier_rows = pd.IndexSlice[[index_1, index_2], :]
df.head(5).style.set_properties(subset=outlier_rows, **{'background-color': 'red'})

# Plot each numeric column as a histogram to show obvious outliers
fig, axes = plt.subplots(1, 2, figsize=(9, 4))

# One entry per panel: (column, bar colour, title)
panels = (
    ('salary', 'steelblue', 'Salary distribution with median line'),
    ('age', 'salmon', 'Age distribution with median line'),
)
for ax, (column, bar_color, title) in zip(axes, panels):
    values = df[column]
    ax.hist(values, bins=30, color=bar_color)
    # Dashed red vertical line marks the column's median
    ax.axvline(values.median(), color='red', linestyle='--', linewidth=2)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Compute absolute Z-scores for each column: distance from the column mean,
# measured in (sample) standard deviations
z_salary = np.abs((df['salary'] - df['salary'].mean()) / df['salary'].std())
z_age = np.abs((df['age'] - df['age'].mean()) / df['age'].std())

# A row is kept only when both of its Z-scores are below 3
is_inlier = (z_salary < 3) & (z_age < 3)

# Filter outliers (with a score of 3 or above) from dataframe
df_clean = df[is_inlier]

# Display filtered-out rows. The inverted mask is used instead of comparing
# df against df_clean, which only works through index alignment and would
# also report any kept row that contains a NaN.
df_removed_rows = df[~is_inlier]
df_removed_rows

# Compute salary IQR and outlier range (1.5 * IQR beyond the quartiles)
Q1_salary = df['salary'].quantile(0.25)
Q3_salary = df['salary'].quantile(0.75)
IQR_salary = Q3_salary - Q1_salary
lower_salary = Q1_salary - 1.5 * IQR_salary
upper_salary = Q3_salary + 1.5 * IQR_salary

# Compute age IQR and outlier range (1.5 * IQR beyond the quartiles)
Q1_age = df['age'].quantile(0.25)
Q3_age = df['age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
lower_age = Q1_age - 1.5 * IQR_age
upper_age = Q3_age + 1.5 * IQR_age

# A row is an inlier when both values lie inside their range; between() is
# inclusive on both ends, matching the >= / <= bounds checks it replaces
salary_in_range = df['salary'].between(lower_salary, upper_salary)
age_in_range = df['age'].between(lower_age, upper_age)
is_inlier = salary_in_range & age_in_range

# Keep only rows where both salary and age are not outliers
df_clean = df[is_inlier]

# Display filtered-out rows. The inverted mask is used instead of comparing
# df against df_clean, which only works through index alignment and would
# also report any kept row that contains a NaN.
df_removed_rows = df[~is_inlier]
df_removed_rows

# contamination=0.02 asks the model to flag about 2% of the rows as outliers.
# NOTE(review): no random_state is passed, so the result presumably depends on
# NumPy's global RNG state at this point - confirm if reproducibility matters.
isolation_forest = IsolationForest(contamination=0.02)

# Use isolation forest to identify outliers; the labels are stored on df
# itself (the filter below treats 1 as "keep")
df['outlier'] = isolation_forest.fit_predict(df[['salary', 'age']])

# Filter outliers from dataframe
is_inlier = df['outlier'] == 1
df_clean = df[is_inlier]

# Display filtered-out rows (without the helper label column). The inverted
# mask is used instead of comparing df against df_clean, which only works
# through index alignment.
df_removed_rows = df[~is_inlier].drop(columns=['outlier'])
df_removed_rows

# contamination=0.02 asks the model to flag about 2% of the rows as outliers
local_outlier_factor = LocalOutlierFactor(contamination=0.02)

# Use LOF to identify outliers (the filter below treats label 1 as "keep")
labels = local_outlier_factor.fit_predict(df[['salary', 'age']])

# Filter outliers from dataframe.
# NOTE: df may still carry the 'outlier' column written by the Isolation
# Forest step; that column is not produced by this model.
is_inlier = labels == 1
df_clean = df[is_inlier]

# Display filtered-out rows. errors='ignore' keeps this step working when the
# 'outlier' column was never added (e.g. the Isolation Forest step was skipped);
# the inverted mask replaces the alignment-dependent df.ne(df_clean) comparison.
df_removed_rows = df[~is_inlier].drop(columns=['outlier'], errors='ignore')
df_removed_rows

	name	salary	age
0	Employee_1	65272	26
1	Employee_2	71846	47
2	Employee_3	490077	38
3	Employee_4	420066	101
4	Employee_5	79937	37

	name	salary	age
2	Employee_3	490077	38
3	Employee_4	420066	101

	name	salary	age
2	Employee_3	490077	38
3	Employee_4	420066	101
194	Employee_195	121894	21

	name	salary	age
2	Employee_3	490077	38
3	Employee_4	420066	101
77	Employee_78	20441	50
194	Employee_195	121894	21

	name	salary	age
2	Employee_3	490077	38
3	Employee_4	420066	101
81	Employee_82	113948	37
194	Employee_195	121894	21