import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

rand_state = 42  # fixed seed so the split and the tree-based models are reproducible
df = pd.read_excel("Covid.xlsx", sheet_name='coronvirus_march_2020')
print("df has " + str(df.shape[0]) + " rows and " + str(df.shape[1]) + " columns")
df.head()
df has 8044 rows and 6 columns
|   | has_asthma | has_diabetes | age | bmi | cholesterol | outcome |
|---|---|---|---|---|---|---|
| 0 | no | yes | 41 | 21 | 191 | lived |
| 1 | no | no | 51 | 18 | 239 | lived |
| 2 | no | yes | 99 | 24 | 259 | died |
| 3 | no | yes | 76 | 26 | 268 | lived |
| 4 | no | yes | 48 | 25 | 218 | lived |
scores = {}  # model name -> accuracy on the test split
df = df.replace({'no': 0, 'yes': 1})  # encode the yes/no columns numerically
df.head()
|   | has_asthma | has_diabetes | age | bmi | cholesterol | outcome |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 41 | 21 | 191 | lived |
| 1 | 0 | 0 | 51 | 18 | 239 | lived |
| 2 | 0 | 1 | 99 | 24 | 259 | died |
| 3 | 0 | 1 | 76 | 26 | 268 | lived |
| 4 | 0 | 1 | 48 | 25 | 218 | lived |
X = df.drop('outcome',axis=1)
y = df["outcome"]
feature_importance = X.head(0)  # empty frame that keeps the feature names; used at the end of the notebook
y.value_counts()
lived    5713
died     2331
Name: outcome, dtype: int64
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy=1, random_state=rand_state)  # oversample 'died' up to parity
X, y = ros.fit_resample(X, y)
y.value_counts()
lived    5713
died     5713
Name: outcome, dtype: int64
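A caveat worth noting: because the oversampling happens before the train/test split, duplicated minority-class rows can land in both splits, which tends to inflate the test scores below. An alternative (a sketch only, not what this notebook does) is to split first and oversample just the training portion:

# Illustrative only: split on the original data, then oversample the training
# set, so no duplicated 'died' row can appear in both train and test.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25,
                                          random_state=rand_state, stratify=y)
ros_alt = RandomOverSampler(sampling_strategy=1, random_state=rand_state)
X_tr, y_tr = ros_alt.fit_resample(X_tr, y_tr)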
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rand_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit the scaler on the training split only
X_test = scaler.transform(X_test)        # reuse the training statistics on the test split
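As a side note, the scaler and a model can be bundled into a scikit-learn Pipeline so the scaling statistics are always learned from training data only; a minimal sketch (the pipeline would be fit on the unscaled split):

# Sketch: one object that scales and then fits, keeping the steps leak-free.
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scale', StandardScaler()),
                 ('model', LogisticRegression())])
# pipe.fit(X_train, y_train) would scale internally before fitting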
def createConfusionMatrixDataframe(conf_matrix):
    # sklearn orders the classes alphabetically, so row/column 0 is 'died';
    # 'died' is treated as the positive class throughout.
    return pd.DataFrame(conf_matrix,
                        index=['Actual Positive', 'Actual Negative'],
                        columns=['Predicted Positive', 'Predicted Negative'])
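For reference, scikit-learn 1.0+ can also render a confusion matrix directly via ConfusionMatrixDisplay; a sketch using the decision tree predictions computed in the next cell:

# Sketch: plot the matrix without building the DataFrame by hand.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, DecisionTree_predicted)
plt.show()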
DecisionTree = DecisionTreeClassifier(criterion='entropy', random_state=rand_state, max_depth=6)
DecisionTree.fit(X_train, y_train)
DecisionTree_predicted = DecisionTree.predict(X_test)
DecisionTree_conf_matrix = confusion_matrix(y_test, DecisionTree_predicted)
DecisionTree_acc_score = accuracy_score(y_test, DecisionTree_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(DecisionTree_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1300 | 146 |
| Actual Negative | 175 | 1236 |
print("Decision Tree Accuracy (%):",round(DecisionTree_acc_score*100,3),'\n')
print(classification_report(y_test,DecisionTree_predicted))
scores["Decision Tree"] = DecisionTree_acc_score
Decision Tree Accuracy (%): 88.764
              precision    recall  f1-score   support

        died       0.88      0.90      0.89      1446
       lived       0.89      0.88      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857
LogReg = LogisticRegression()  # renamed so the variable no longer shadows the imported class
LogReg.fit(X_train, y_train)
LogReg_predicted = LogReg.predict(X_test)
LogReg_conf_matrix = confusion_matrix(y_test, LogReg_predicted)
LogReg_acc_score = accuracy_score(y_test, LogReg_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(LogReg_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1202 | 244 |
| Actual Negative | 188 | 1223 |
print("Logistic Regression accuracy (%):",round(LogisticRegression_acc_score*100,3),'\n')
print(classification_report(y_test,LogisticRegression_predict))
scores["Logistic Regression"] = LogisticRegression_acc_score
Logistic Regression accuracy (%): 84.879
              precision    recall  f1-score   support

        died       0.86      0.83      0.85      1446
       lived       0.83      0.87      0.85      1411

    accuracy                           0.85      2857
   macro avg       0.85      0.85      0.85      2857
weighted avg       0.85      0.85      0.85      2857
NaiveBayes = GaussianNB()
NaiveBayes.fit(X_train, y_train)  # fit returns self, so no reassignment is needed
NaiveBayes_predicted = NaiveBayes.predict(X_test)
NaiveBayes_conf_matrix = confusion_matrix(y_test, NaiveBayes_predicted)
NaiveBayes_acc_score = accuracy_score(y_test, NaiveBayes_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(NaiveBayes_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1209 | 237 |
| Actual Negative | 139 | 1272 |
print("Naive Bayes accuracy (%):",round(NaiveBayes_acc_score*100,3),'\n')
print(classification_report(y_test,NaiveBayes_predicted))
scores["Naive Bayes"] = NaiveBayes_acc_score
Naive Bayes accuracy (%): 86.839
              precision    recall  f1-score   support

        died       0.90      0.84      0.87      1446
       lived       0.84      0.90      0.87      1411

    accuracy                           0.87      2857
   macro avg       0.87      0.87      0.87      2857
weighted avg       0.87      0.87      0.87      2857
RandomForest = RandomForestClassifier(n_estimators=20, random_state=rand_state, max_depth=5)
RandomForest.fit(X_train,y_train)
RandomForest_predicted = RandomForest.predict(X_test)
RandomForest_conf_matrix = confusion_matrix(y_test, RandomForest_predicted)
RandomForest_acc_score = accuracy_score(y_test, RandomForest_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(RandomForest_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1290 | 156 |
| Actual Negative | 151 | 1260 |
print("Random Forest accuracy (%):",round(RandomForest_acc_score*100,3),'\n')
print(classification_report(y_test,RandomForest_predicted))
scores["Random Forest"] = RandomForest_acc_score
Random Forest accuracy (%): 89.254
              precision    recall  f1-score   support

        died       0.90      0.89      0.89      1446
       lived       0.89      0.89      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857
KNeighbor = KNeighborsClassifier(n_neighbors=7)
KNeighbor.fit(X_train, y_train)
KNeighbor_predicted = KNeighbor.predict(X_test)
KNeighbor_conf_matrix = confusion_matrix(y_test, KNeighbor_predicted)
KNeighbor_acc_score = accuracy_score(y_test, KNeighbor_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(KNeighbor_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1315 | 131 |
| Actual Negative | 191 | 1220 |
print("K-Neighbors accuracy (%):",round(KNeighbor_acc_score*100,3),'\n')
print(classification_report(y_test,KNeighbor_predicted))
scores["K-Neighbors"] = KNeighbor_acc_score
K-Neighbors accuracy (%): 88.729
              precision    recall  f1-score   support

        died       0.87      0.91      0.89      1446
       lived       0.90      0.86      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857
SupportVector = SVC(kernel='rbf', C=2)
SupportVector.fit(X_train, y_train)
SupportVector_predicted = SupportVector.predict(X_test)
SupportVector_conf_matrix = confusion_matrix(y_test, SupportVector_predicted)
SupportVector_acc_score = accuracy_score(y_test, SupportVector_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(SupportVector_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1280 | 166 |
| Actual Negative | 162 | 1249 |
print("Support Vector accuracy (%):",round(SupportVector_acc_score*100,3),'\n')
print(classification_report(y_test,SupportVector_predicted))
scores["Support Vector"] = SupportVector_acc_score
Support Vector accuracy (%): 88.519
              precision    recall  f1-score   support

        died       0.89      0.89      0.89      1446
       lived       0.88      0.89      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857
GradientBooster = GradientBoostingClassifier(random_state=rand_state)  # seeded for reproducibility, like the other ensembles
GradientBooster.fit(X_train, y_train)
GradientBooster_predicted = GradientBooster.predict(X_test)
GradientBooster_conf_matrix = confusion_matrix(y_test, GradientBooster_predicted)
GradientBooster_acc_score = accuracy_score(y_test, GradientBooster_predicted)
ConfusionMatrix = createConfusionMatrixDataframe(GradientBooster_conf_matrix)
ConfusionMatrix
|   | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | 1291 | 155 |
| Actual Negative | 145 | 1266 |
print("Gradient Booster accuracy (%):",round(GradientBooster_acc_score*100,3),'\n')
print(classification_report(y_test,GradientBooster_predicted))
scores["Gradient Booster"] = GradientBooster_acc_score
Gradient Booster accuracy (%): 89.499
              precision    recall  f1-score   support

        died       0.90      0.89      0.90      1446
       lived       0.89      0.90      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.90      0.89      2857
weighted avg       0.90      0.89      0.89      2857
accuracy = pd.DataFrame.from_dict(scores, orient='index', columns=['Accuracy'])
accuracy = accuracy.sort_values('Accuracy', ascending=False)
print("The best performing model was the: " + accuracy.reset_index().iloc[0,0])
accuracy
The best performing model was: Gradient Booster
|   | Accuracy |
|---|---|
| Gradient Booster | 0.894995 |
| Random Forest | 0.892545 |
| Decision Tree | 0.887644 |
| K-Neighbors | 0.887294 |
| Support Vector | 0.885194 |
| Naive Bayes | 0.868393 |
| Logistic Regression | 0.848792 |
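These accuracies all come from one 75/25 split, so small gaps in the ranking (Gradient Booster vs. Random Forest is about 0.2 percentage points) can flip with a different seed. A sketch of a steadier comparison with 5-fold cross-validation, keeping in mind that the duplicated oversampled rows still span folds here:

# Sketch: cross-validated accuracy for one model; the Pipeline refits the
# scaler inside each training fold.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
cv_pipe = Pipeline([('scale', StandardScaler()),
                    ('model', GradientBoostingClassifier())])
cv_scores = cross_val_score(cv_pipe, X, y, cv=5, scoring='accuracy')
print(cv_scores.mean().round(3), '+/-', cv_scores.std().round(3))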
# Importances are taken from the Random Forest model (Gradient Boosting scored best overall)
feature_importance = pd.DataFrame(RandomForest.feature_importances_,
                                  index=feature_importance.columns,
                                  columns=['Importance Score'])
feature_importance = feature_importance.sort_values('Importance Score', ascending=False)
print("The most important feature was: " + feature_importance.reset_index().iloc[0,0])
feature_importance
The most important feature was: age
|   | Importance Score |
|---|---|
| age | 0.752708 |
| bmi | 0.235472 |
| cholesterol | 0.009161 |
| has_diabetes | 0.001359 |
| has_asthma | 0.001299 |
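Impurity-based importances like these tend to favour continuous, high-cardinality features (age, bmi, cholesterol) over binary flags, so the near-zero scores for has_asthma and has_diabetes should be read with care. Permutation importance on the held-out test set is a common cross-check; a sketch, assuming X is still the resampled DataFrame so X.columns holds the feature names:

# Sketch: measure the accuracy drop when each feature is shuffled in the
# scaled test set, averaged over 10 shuffles.
from sklearn.inspection import permutation_importance
perm = permutation_importance(RandomForest, X_test, y_test,
                              n_repeats=10, random_state=rand_state)
print(pd.Series(perm.importances_mean, index=X.columns)
        .sort_values(ascending=False))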
feature_importance['Importance Score'].plot(kind='bar')  # pandas labels the x-ticks from the index
plt.ylabel('Importance Score')
plt.title('Features by Importance Score')
plt.show()