This notebook contains a k-means cluster analysis that finds groups within a simple two-column dataset. The data are fabricated: I generated them from three groups (young/low-salary, young/high-salary, and old/high-salary), and the k-means analysis correctly identifies that the optimal number of clusters is three.
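For reference, a dataset with this shape can be fabricated with a sketch like the one below. The exact means, spreads, and group sizes used to build `salaries_and_age.xlsx` are assumptions; only the three-group structure comes from the description above.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Assumed group parameters: (mean salary, mean age, group size)
groups = [
    (35_000, 32, 60),  # young, low salary
    (90_000, 34, 60),  # young, high salary
    (95_000, 58, 60),  # old, high salary
]

frames = []
for mean_salary, mean_age, n in groups:
    frames.append(pd.DataFrame({
        'Salary': rng.normal(mean_salary, 4_000, n).round().astype(int),
        'Age': rng.normal(mean_age, 3, n).round().astype(int),
    }))

df = pd.concat(frames, ignore_index=True)
print(df.shape)  # → (180, 2)
```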
Import modules
In [ ]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
Import data
In [2]:
df = pd.read_excel("salaries_and_age.xlsx")
df.head()
Out[2]:
| | Salary | Age |
|---|---|---|
| 0 | 36339 | 30 |
| 1 | 32362 | 29 |
| 2 | 32595 | 35 |
| 3 | 39178 | 30 |
| 4 | 34914 | 37 |
Visualise data
In [3]:
plt.scatter(df.Salary, df.Age)
plt.xlabel('Salary')
plt.ylabel('Age')
Out[3]:
Text(0, 0.5, 'Age')
Scale data
In [4]:
# Standardise both columns so they contribute equally to the distance metric
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
df.head()
Out[4]:
| | Salary | Age |
|---|---|---|
| 0 | -1.142513 | -0.588249 |
| 1 | -1.376498 | -0.677552 |
| 2 | -1.362789 | -0.141733 |
| 3 | -0.975483 | -0.588249 |
| 4 | -1.226352 | 0.036874 |
Determine ideal number for k (number of groups)
In [5]:
range_n_clusters = range(2, 11)
# Compute average silhouette score for each number of clusters
silhouette_scores = []
for n_clusters in range_n_clusters:
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(df[['Age', 'Salary']])
score = silhouette_score(df[['Age', 'Salary']], labels)
silhouette_scores.append(score)
# Find the optimal number of clusters based on the highest silhouette score
optimal_clusters = range_n_clusters[silhouette_scores.index(max(silhouette_scores))]
# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method')
plt.axvline(x=optimal_clusters, color='red', linestyle='--', label=f'Optimal Clusters: {optimal_clusters}')
plt.legend()
plt.show()
In [12]:
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, random_state=42)
kmeans.fit(df[['Age', 'Salary']])
wcss.append(kmeans.inertia_)
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='x', color='blue', label='Within-Cluster Sum of Squares')
# Add the dashed red line for the optimal number of clusters
plt.axvline(x=optimal_clusters, color='red', linestyle='--',
linewidth=2, label=f'Optimal K (Silhouette): {optimal_clusters}')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within-Cluster Sum of Squares')
plt.legend()
plt.grid(True, linestyle=':', alpha=0.6)
plt.show()
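Rather than reading the elbow off the plot by eye, it can also be located programmatically. A minimal sketch using the point of sharpest bend (largest second difference) on an assumed WCSS curve; the `wcss_example` values are illustrative, not the notebook's actual inertias:

```python
import numpy as np

def elbow_k(ks, wcss):
    """Return the k at the sharpest bend (largest second difference) of the WCSS curve."""
    second_diff = np.diff(wcss, n=2)
    # second_diff[i] measures the bend at wcss index i + 1
    return ks[int(np.argmax(second_diff)) + 1]

ks = list(range(1, 11))
wcss_example = [500, 300, 100, 80, 70, 64, 60, 57, 55, 54]  # assumed shape
print(elbow_k(ks, wcss_example))  # → 3
```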
Train model
In [6]:
# Train the final model with the optimal k found above
km = KMeans(n_clusters=optimal_clusters, random_state=42)
# Perform clustering
y_predicted = km.fit_predict(df[['Age', 'Salary']])
# Append cluster label to dataframe
df['cluster'] = y_predicted
df.sample(10)
Out[6]:
| | Salary | Age | cluster |
|---|---|---|---|
| 117 | 0.327698 | -0.766855 | 0 |
| 73 | 1.106136 | 1.376422 | 1 |
| 42 | -1.177402 | -0.588249 | 2 |
| 153 | -0.213578 | -0.945462 | 0 |
| 141 | 0.188437 | -0.409643 | 0 |
| 44 | -1.284422 | -0.677552 | 2 |
| 67 | 0.913394 | 0.929906 | 1 |
| 121 | 0.241271 | -0.856159 | 0 |
| 86 | 1.149497 | 1.465725 | 1 |
| 77 | 0.882389 | 1.019209 | 1 |
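Because the model was trained on standardized features, its centroids are in z-score units; `StandardScaler.inverse_transform` maps them back to interpretable ages and salaries. A self-contained sketch (the two synthetic groups and their parameters are stand-ins for the spreadsheet data):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# Two well-separated synthetic (age, salary) groups
X = np.vstack([
    rng.normal([30, 35_000], [3, 3_000], (50, 2)),
    rng.normal([55, 90_000], [3, 3_000], (50, 2)),
])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

km = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_scaled)

# Map centroids from z-scores back to original units (age, salary)
centers_original = scaler.inverse_transform(km.cluster_centers_)
print(np.round(centers_original))
```

The recovered centroids sit near the group means the data were drawn from, which makes them far easier to report than the standardized coordinates.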
Visualise model results
In [ ]:
plt.figure(figsize=(10,6))
sns.scatterplot(x='Age', y='Salary', hue='cluster', data=df, palette='viridis', s=100)
# Plot the centroids
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1],
color='red', marker='*', s=200, label='Centroids')
plt.title(f'K-Means Clustering (k={optimal_clusters})')
plt.legend()
plt.show()