from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session so library
# notices do not clutter the cell outputs.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
%matplotlib inline
# Workbook holding one row per person with 'Salary' and 'Age' columns.
DATA_PATH = "salaries_and_age.xlsx"

# Read the whole sheet into a DataFrame; every later cell works on `df`.
df = pd.read_excel(DATA_PATH)

# Preview the first rows to confirm the columns loaded as expected.
df.head()
| | Salary | Age |
|---|---|---|
| 0 | 36339 | 30 |
| 1 | 32362 | 29 |
| 2 | 32595 | 35 |
| 3 | 39178 | 30 |
| 4 | 34914 | 37 |
# Visual check of the raw (unscaled) data: salary on the x-axis, age on the y-axis.
plt.scatter(x=df['Salary'], y=df['Age'])
plt.xlabel('Salary')
plt.ylabel('Age')
Text(0, 0.5, 'Age')
# Standardize each feature independently (zero mean, unit variance) so that the
# much larger Salary values do not dominate the distance computations.
scaler = StandardScaler()
for column in ('Age', 'Salary'):
    # The scaler is re-fitted per column and the column is overwritten in place;
    # after the loop `scaler` holds the statistics of the last column (Salary).
    df[column] = scaler.fit_transform(df[[column]])

# Preview the scaled values.
df.head()
| | Salary | Age |
|---|---|---|
| 0 | -1.142513 | -0.588249 |
| 1 | -1.376498 | -0.677552 |
| 2 | -1.362789 | -0.141733 |
| 3 | -0.975483 | -0.588249 |
| 4 | -1.226352 | 0.036874 |
# Candidate cluster counts to evaluate: k = 2 .. 10.
range_n_clusters = range(2, 11)

# The clustering and its scoring both use the same two feature columns.
features = df[['Age', 'Salary']]

# Average silhouette score for each candidate k, in the order of range_n_clusters.
silhouette_scores = []
for n_clusters in range_n_clusters:
    # Fixed seed so every candidate k is evaluated reproducibly.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(features)
    score = silhouette_score(features, labels)
    silhouette_scores.append(score)

# The optimal k is the one with the highest silhouette score
# (the first such k if several candidates tie).
optimal_clusters, _ = max(
    zip(range_n_clusters, silhouette_scores),
    key=lambda pair: pair[1],
)

# Plot score versus k and mark the selected k with a dashed vertical line.
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.title('Silhouette Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.axvline(
    x=optimal_clusters,
    color='red',
    linestyle='--',
    label=f'Optimal Clusters: {optimal_clusters}',
)
plt.legend()
plt.show()
# Train the final model with the cluster count chosen by the silhouette search.
# random_state is pinned to the same seed used in that search so the fit is
# reproducible and yields the same clustering whose score was evaluated.
km = KMeans(n_clusters=optimal_clusters, random_state=42)

# Perform clustering: fit on the two feature columns and get one label per row.
y_predicted = km.fit_predict(df[['Age', 'Salary']])

# Append cluster group to dataframe
df['cluster'] = y_predicted

# Spot-check ten randomly chosen rows together with their assigned cluster.
df.sample(10)
| | Salary | Age | cluster |
|---|---|---|---|
| 67 | 0.913394 | 0.929906 | 1 |
| 25 | -1.317428 | 0.572693 | 2 |
| 24 | -1.420329 | -1.302675 | 2 |
| 12 | -1.530055 | -1.124068 | 2 |
| 72 | 1.156674 | 0.840603 | 1 |
| 47 | -1.209114 | -0.677552 | 2 |
| 46 | -1.305014 | -1.124068 | 2 |
| 134 | 0.353350 | -0.766855 | 0 |
| 116 | -0.032898 | -0.945462 | 0 |
| 76 | 1.053067 | 1.376422 | 1 |
# Plot every cluster that was actually found instead of assuming exactly three:
# the cluster count is chosen dynamically, so hard-coding three subsets would
# silently omit the points of any additional cluster.
# The first three colors match the original green/red/black scheme; the list
# covers up to ten clusters and wraps around beyond that. Purple is reserved
# for the centroid markers.
cluster_colors = [
    'green', 'red', 'black', 'blue', 'orange',
    'cyan', 'magenta', 'brown', 'olive', 'gray',
]
for cluster_id, members in df.groupby('cluster'):
    plt.scatter(
        members['Age'],
        members['Salary'],
        color=cluster_colors[cluster_id % len(cluster_colors)],
        label=f'cluster {cluster_id}',
    )

# The model was fitted on the columns ['Age', 'Salary'] in that order, so
# centroid column 0 is Age (x-axis) and column 1 is Salary (y-axis).
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
plt.xlabel('Age')
plt.ylabel('Salary')
plt.legend()
<matplotlib.legend.Legend at 0x1e4cf734590>