using Pkg
Pkg.add(["DecisionTree", "SQLite", "DataFrames", "StatsBase", "Plots", "MLUtils", "MLJ", "MLJDecisionTreeInterface", "CategoricalArrays", "NearestNeighborModels", "XLSX", "Statistics"]);

   Resolving package versions...
     Project No packages added to or removed from `C:\Users\lblevins\.julia\environments\v1.12\Project.toml`
    Manifest No packages added to or removed from `C:\Users\lblevins\.julia\environments\v1.12\Manifest.toml`

using DecisionTree, SQLite, DataFrames, XLSX, StatsBase, Plots, Statistics, MLUtils, MLJ, MLJDecisionTreeInterface, CategoricalArrays

db = SQLite.DB("insurance.db")

query = """
   SELECT
        age, salary, number_of_kids, has_degree, is_married, sex, bought_insurance
    
    FROM
        life_policies
    
    WHERE
        country = 'england'
"""

df = DataFrame(DBInterface.execute(db, query))

SQLite.close(db)

num_rows = nrow(df)
println("Number of rows: ", num_rows)

first(df, 3)

Number of rows: 3505

function remove_outliers_iqr(df, col)
    x = df[!, col]
    q1 = quantile(x, 0.25)
    q3 = quantile(x, 0.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    return df[(x .>= lower) .& (x .<= upper), :]
end

df = remove_outliers_iqr(df, :salary)

num_rows_no_outliers = nrow(df)
println("Number of rows: ", num_rows_no_outliers, " (", num_rows-num_rows_no_outliers, " removed)")

first(df, 3)

Number of rows: 3504 (1 removed)

insurance_counts = countmap(df.bought_insurance)

insurance_df = DataFrame(
    bought_insurance = collect(keys(insurance_counts)),
    count = collect(values(insurance_counts))
)

bar(
    collect(keys(insurance_counts)),
    collect(values(insurance_counts)),
    legend = false,
    xticks = :auto,
    xlabel = "Purchased life insurance?",
    ylabel = "Count",
    title = "Portion of 30–65 year olds who purchased life insurance",
    titlefont = font(10),
    rotation = 0
)

function process_dataframe(df)
    column_mappings = Dict(
        :sex        => Dict("Female" => 0, "Male" => 1),
        :is_married => Dict("No" => 0, "Yes" => 1),
        :has_degree => Dict("No" => 0, "Yes" => 1)
    )

    for (col, mapping) in column_mappings
        coldata = df[!, col]
        df[!, col] = map(x -> mapping[x], coldata)   # produces Vector{Int}
    end

    df[!, :age] = Int.(df[!, :age])
    df[!, :salary] = Int.(df[!, :salary])
    df[!, :number_of_kids] = Int.(df[!, :number_of_kids])

    return df
end

df = process_dataframe(df);

first(df, 3)

target_variable = :bought_insurance
df = coerce(df, target_variable => Multiclass)

y, X = unpack(df, ==(target_variable));

train, test = partition(eachindex(y), 0.8, shuffle=true, rng=42);

# Load the model
RandomForestClassifier = @load RandomForestClassifier pkg=DecisionTree
rf = RandomForestClassifier()

# Define hyperparameters and fit models
ranges = [
    range(rf, :n_trees, values=[50, 100, 200]),
    range(rf, :max_depth, values=[5, 10, 20]),
    range(rf, :min_samples_leaf, values=[1, 2, 4])
]

tuned_rf = TunedModel(
    model = rf,
    tuning = Grid(),
    #resampling = Holdout(fraction_train=0.8),
    resampling = CV(nfolds=5),   # ← 5‑fold cross‑validation
    ranges = ranges,
    measure = accuracy
)

tuned_model = machine(tuned_rf, X, y)
MLJ.fit!(tuned_model)

# Predict using best model
y_pred = predict_mode(tuned_model, rows=test);

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\lblevins\.julia\packages\MLJModels\BfLy4\src\loading.jl:159

import MLJDecisionTreeInterface ✔

┌ Info: Training machine(ProbabilisticTunedModel(model = RandomForestClassifier(max_depth = -1, …), …), …).
└ @ MLJBase C:\Users\lblevins\.julia\packages\MLJBase\yVJvJ\src\machines.jl:499
┌ Info: Attempting to evaluate 27 models.
└ @ MLJTuning C:\Users\lblevins\.julia\packages\MLJTuning\xiLEY\src\tuned_models.jl:762
Evaluating over 27 metamodels: 100%[=========================] Time: 0:00:40

# Evaluate model
println("Best Random Forest Accuracy: ", accuracy(y_pred, y[test]))

# Output best parameters
best_model = fitted_params(tuned_model).best_model
println("Best parameters:")
println(MLJ.params(best_model))

Best Random Forest Accuracy: 0.9087018544935807
Best parameters:
(max_depth = 10, min_samples_leaf = 4, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 100, sampling_fraction = 0.7, feature_importance = :impurity, rng = Random.TaskLocalRNG())
(max_depth = 10, min_samples_leaf = 4, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 100, sampling_fraction = 0.7, feature_importance = :impurity, rng = Random.TaskLocalRNG())

importances = feature_importances(tuned_model)

sorted_importances = sort(importances, by=v->v[2], rev=true)

importance_df = DataFrame(
    Feature = [p[1] for p in sorted_importances],
    Importance = [p[2] for p in sorted_importances]
)

normalised = (importance_df.Importance .- minimum(importance_df.Importance)) ./ (maximum(importance_df.Importance) - minimum(importance_df.Importance)) # normalise importance values to [0, 1]

bar(
    String.(importance_df.Feature),
    importance_df.Importance,
    legend = false,
    xlabel = "Feature",
    ylabel = "Importance",
    title = "Feature Importances",
    rotation = 45,
    fillcolor = cgrad(:reds, rev=false)[normalised]
)

# Read Excel file
df_new = DataFrame(XLSX.readtable("data_to_predict_on.xlsx", "candidates"))

# Process dataframe
df_new = process_dataframe(df_new)

# Make predictions
predictions = MLJ.predict(tuned_model, df_new)

# Get prediction labels and add to df
pred_labels = mode.(predictions)
df_new[!, :predicted_insurance] = pred_labels;

# Get prediction confidence values and add to df
confidences = [pdf(p, mode(p)) for p in predictions]
df_new[!, :prediction_confidence] = confidences;

df_new

Row	Feature	Importance
	Symbol	Float64
1	number_of_kids	0.388225
2	age	0.300457
3	salary	0.224261
4	is_married	0.0571353
5	has_degree	0.0151233
6	sex	0.0147989

Row	age	salary	number_of_kids	has_degree	is_married	sex
	Int64	Int64	Int64	Int64	Int64	Int64
1	35	50100	0	1	0	1
2	39	55000	1	1	0	0
3	22	24001	0	0	0	1
4	62	45045	2	1	0	0
5	49	47500	3	1	1	0

Row	age	salary	number_of_kids	has_degree	is_married	sex	predicted_insurance	prediction_confidence
	Int64	Int64	Int64	Int64	Int64	Int64	Cat…	Float64
1	35	50100	0	1	0	1	No	1.0
2	39	55000	1	1	0	0	No	0.99
3	22	24001	0	0	0	1	No	1.0
4	62	45045	2	1	0	0	Yes	0.64
5	49	47500	3	1	1	0	Yes	0.9

Install and import packages¶

Import data¶

Remove outliers¶

See if target is (roughly) balanced¶

Process/encode data¶

Define target and feature vector¶

Split data into test and train datasets¶

Create and test Random Forest¶

Identify most important features¶

Predict on unseen data¶