# Seed passed as `random_state` to the train/test splits and the random forests
# below so that repeated runs produce the same splits and models.
rand_state = 7

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Other modules/settings
from imblearn.over_sampling import RandomOverSampler
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook. This line is
# only valid inside IPython/Jupyter, not in a plain Python script.
%matplotlib inline

# Import English data: pull the six predictor columns plus the target
# (bought_insurance) for policies sold in England.
english_policies_query = """
    SELECT
        age, salary, number_of_kids, has_degree, is_married, sex, bought_insurance

    FROM
        life_policies

    WHERE
        country = 'england'
    """

conn = sqlite3.connect('life_insurance.db')
try:
    df = pd.read_sql_query(english_policies_query, conn)
finally:
    # `conn.close` without parentheses only references the method and never
    # runs it; call it so the database handle is released even if the query
    # raises.
    conn.close()

df.head(4)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2599 entries, 0 to 2598
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               2599 non-null   int64 
 1   salary            2599 non-null   int64 
 2   number_of_kids    2599 non-null   int64 
 3   has_degree        2599 non-null   object
 4   is_married        2599 non-null   object
 5   sex               2599 non-null   object
 6   bought_insurance  2599 non-null   object
dtypes: int64(3), object(4)
memory usage: 142.3+ KB

def transform_categoricals(df):
    """Return a new DataFrame with the binary text columns encoded as 0/1.

    Only the ``sex``, ``is_married`` and ``has_degree`` columns are affected:
    'Female' / 'No' become 0 and 'Male' / 'Yes' become 1. Values outside
    those mappings, and all other columns, are left as they are. The input
    frame itself is not modified.
    """
    no_yes_codes = {'No': 0, 'Yes': 1}
    encodings = {
        'sex': {'Female': 0, 'Male': 1},
        'is_married': dict(no_yes_codes),
        'has_degree': dict(no_yes_codes),
    }

    # A nested dict restricts each value mapping to the column named by its key.
    return df.replace(to_replace=encodings)

# Replace the text values in sex / is_married / has_degree with 0/1 codes.
df = transform_categoricals(df)

# Preview the first rows of the encoded frame.
df.head(3)

remove_outliers_from = ['age', 'salary', 'number_of_kids']

# Keep only rows lying strictly within 3 standard deviations of the mean.
# Columns are handled one after another, so each column's mean and standard
# deviation are computed on the rows that survived the earlier columns.
for col in remove_outliers_from:
    mean, std_dev = df[col].mean(), df[col].std()
    lower_cutoff, upper_cutoff = mean - 3 * std_dev, mean + 3 * std_dev

    pre_removal = len(df)
    df = df[(df[col] > lower_cutoff) & (df[col] < upper_cutoff)]
    post_removal = len(df)

    # Only report columns where something was actually dropped.
    if post_removal < pre_removal:
        print(f"{pre_removal - post_removal} outlier value(s) identified in '{col}' removed.")

1 outlier value(s) identified in 'salary' removed.

target_variable = 'bought_insurance'

# Split the frame into the feature matrix (X) and the label column (y).
X = df.drop(columns=[target_variable])
y = df[target_variable]

# Class balance of the target, shown as a bar chart.
insurance_counts = y.value_counts()

insurance_counts.plot.bar(rot=0, color=['green', 'red'])
plt.title('Portion of 30-65 year olds who purchased life insurance')
plt.xlabel('Purchased life insurance?')
plt.ylabel('Count')

plt.show()

# Balance the classes by randomly duplicating minority-class rows until both
# classes have the same number of rows (sampling_strategy=1).
# The sampler is seeded with rand_state: without a seed a different set of rows
# is duplicated on every run, so the scores reported further down would not be
# reproducible.
# NOTE(review): resampling is done before the train/test split, so copies of
# the same row can end up in both the training and the test set and inflate the
# test scores — consider resampling the training portion only.
OverSampler = RandomOverSampler(sampling_strategy=1, random_state=rand_state)
X, y = OverSampler.fit_resample(X, y)
y.value_counts()

bought_insurance
No     1694
Yes    1694
Name: count, dtype: int64

# Hold out 33% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=rand_state)

# Preview the held-out features.
X_test.head()

# Hyper-parameter values to search over (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [7, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# Base classifier handed to the grid search; seeded for repeatability
RandomForrestC = RandomForestClassifier(random_state=rand_state)

# Evaluate every combination with 5-fold cross-validation on the training set
grid_search = GridSearchCV(
    estimator=RandomForrestC,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning model and the parameter combination that produced it
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict the held-out rows with the winning model
y_prediction = best_estimator.predict(X_test)

def evaluate_model(y_test, y_prediction, best_params):
    """Print an evaluation summary for a tuned model and return its accuracy.

    Prints the accuracy, the full classification report and the chosen
    hyper-parameters.

    Args:
        y_test: True labels of the held-out rows.
        y_prediction: Labels predicted for those same rows.
        best_params: Mapping with the 'max_depth', 'min_samples_split' and
            'n_estimators' values selected by the grid search.

    Returns:
        The accuracy score rounded to 3 decimal places.
    """
    score = round(accuracy_score(y_test, y_prediction), 3)
    print(f"Model accuracy score is {score}\n")

    report = classification_report(y_test, y_prediction)
    print(f"Full classification report:\n{report}")

    print(
        "Best model is...\n"
        f"Max depth: {best_params['max_depth']}"
        f", Min samples split: {best_params['min_samples_split']}"
        f", Number of estimators: {best_params['n_estimators']}"
    )

    return score

# Report the first model's test-set performance and keep its rounded accuracy
# so it can be compared with the retrained model later on.
score1 = evaluate_model(y_test, y_prediction, best_params)

Model accuracy score is 0.895

Full classification report:
              precision    recall  f1-score   support

          No       0.93      0.86      0.89       575
         Yes       0.87      0.93      0.90       544

    accuracy                           0.89      1119
   macro avg       0.90      0.90      0.89      1119
weighted avg       0.90      0.89      0.89      1119

Best model is...
Max depth: 20, Min samples split: 2, Number of estimators: 500

# Importance assigned to each feature by the tuned forest, largest first
feature_scores = pd.Series(
    data=best_estimator.feature_importances_,
    index=X_train.columns,
)
feature_scores = feature_scores.sort_values(ascending=False)

# Horizontal bar chart: one bar per feature, ordered by importance
plot = sns.barplot(x=feature_scores, y=feature_scores.index)
plt.ylabel('Feature')
plt.xlabel('Feature importance score')
plt.show()

# The feature with the smallest importance score is the one to drop
worst_feature = feature_scores.sort_values(ascending=True).index[0]
X = X.drop(columns=[worst_feature])

# Re-split with the same seed and test fraction as the first model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=rand_state
)

# Fresh base classifier for the reduced feature set
RandomForrestC_2 = RandomForestClassifier(random_state=rand_state)

# Same parameter grid and 5-fold cross-validation as before
grid_search = GridSearchCV(
    estimator=RandomForrestC_2,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning model and parameters for the reduced feature set
best_estimator_2 = grid_search.best_estimator_
best_params_2 = grid_search.best_params_

# Predict the held-out rows and report the retrained model's performance
y_prediction = best_estimator_2.predict(X_test)

score2 = evaluate_model(y_test, y_prediction, best_params_2)

Model accuracy score is 0.901

Full classification report:
              precision    recall  f1-score   support

          No       0.94      0.86      0.90       575
         Yes       0.87      0.94      0.90       544

    accuracy                           0.90      1119
   macro avg       0.90      0.90      0.90      1119
weighted avg       0.90      0.90      0.90      1119

Best model is...
Max depth: 20, Min samples split: 2, Number of estimators: 500

# Keep whichever tuned model scored better on the test set, then persist it.
if score1 < score2:
    higher_lower = "higher"
    best_estimator = best_estimator_2
    remove_worst_predictor = True
else:
    higher_lower = "lower"
    # Define the flag on this path too: the prediction step reads it
    # unconditionally and would otherwise fail with a NameError.
    remove_worst_predictor = False

# Save the fitted winner. GridSearchCV fits clones of the estimator it is
# given, so the RandomForrestC / RandomForrestC_2 objects themselves are never
# fitted, and a model reloaded from them could not make predictions.
dump(best_estimator, 'rf_model.joblib')

# Report the size of the change; the direction is carried by higher_lower, so
# the magnitude is printed without a sign.
print('Retrained model accuracy score is ' + str(score2) + " (" + str(round(abs(score2 - score1), 3)) + " " + higher_lower + ")")

Retrained model accuracy score is 0.901 (0.006 higher)

# Load the candidates to score from the Excel workbook
df_to_predict_on = pd.read_excel("data_to_predict_on.xlsx", sheet_name="candidates")

df_to_predict_on.head(3)

# Apply the same 0/1 encoding that was used on the training data
df_to_predict_on = transform_categoricals(df_to_predict_on)

# Match the training features: drop the least important column when the
# retrained (reduced) model is the one that was kept
if remove_worst_predictor:
    df_to_predict_on = df_to_predict_on.drop(columns=[worst_feature])

df_to_predict_on.head(3)

# Score the candidates and attach the result as a new column
predictions = best_estimator.predict(df_to_predict_on)
df_to_predict_on = df_to_predict_on.assign(predicted_outcome=predictions)

df_to_predict_on

	age	salary	number_of_kids	has_degree	is_married	sex
2024	60	34039	2	0	1	0
3182	51	46302	4	0	1	0
1969	48	47502	1	1	1	1
1559	44	33797	5	0	1	0
87	33	82144	2	1	1	1

	age	salary	number_of_kids	has_degree
0	35	50100	0	1
1	39	55000	1	1
2	22	24001	0	0

In this notebook I use scikit-learn's RandomForestClassifier to predict whether a person will buy life insurance.¶

Import packages and data¶

Print overview of dataframe¶

Encode data¶

Remove outliers¶

Define target and feature vector¶

Perform oversampling of minority class¶

Split data into test and train datasets¶

Create Random Forest and make predictions¶

Evaluate model¶

Identify most important features¶

Retrain/retest the model without the least important feature and save the model¶

Use model to predict on new data¶

	age	salary	number_of_kids	has_degree	is_married	sex	bought_insurance
0	35	95469	0	Yes	Yes	Female	No
1	42	23859	2	Yes	No	Female	No
2	36	45412	2	Yes	Yes	Male	No
3	46	40202	3	Yes	No	Male	No