In this notebook I compare the performance of the DecisionTreeClassifier, LogisticRegression, GaussianNB, RandomForestClassifier, KNeighborsClassifier, SVC, and GradientBoostingClassifier models in predicting the outcome (lived, died) of COVID-19 infection.¶

In [1]:
rand_state = 42

Imports¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')
In [3]:
df = pd.read_excel("Covid.xlsx", sheet_name='coronvirus_march_2020')
print("df has " + str(df.shape[0]) + " rows and " + str(df.shape[1]) + " columns")
df.head()
df has 8044 rows and 6 columns
Out[3]:
has_asthma has_diabetes age bmi cholesterol outcome
0 no yes 41 21 191 lived
1 no no 51 18 239 lived
2 no yes 99 24 259 died
3 no yes 76 26 268 lived
4 no yes 48 25 218 lived
In [4]:
scores = {}

Prepare and examine data¶

In [5]:
df = df.replace(['no'],0)
df = df.replace(['yes'],1)
df.head()
Out[5]:
has_asthma has_diabetes age bmi cholesterol outcome
0 0 1 41 21 191 lived
1 0 0 51 18 239 lived
2 0 1 99 24 259 died
3 0 1 76 26 268 lived
4 0 1 48 25 218 lived
In [6]:
X = df.drop('outcome',axis=1)
y = df["outcome"]
feature_importance = X.head(0) # used at end of notebook
y.value_counts()
Out[6]:
lived    5713
died     2331
Name: outcome, dtype: int64
In [7]:
from imblearn.over_sampling import RandomOverSampler
rus = RandomOverSampler(sampling_strategy =1)
X, y = rus.fit_resample(X, y)
y.value_counts()
Out[7]:
lived    5713
died     5713
Name: outcome, dtype: int64
In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = rand_state)
In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Function to create confusion matrix dataframe¶

In [10]:
def createConfusionMatrixDataframe(confusion_matrix):
    ConfusionMatrix = pd.DataFrame(confusion_matrix,
                 index = ['Actual Positive', 'Actual Negative'],
                 columns = ['Predicted Positive', 'Predicted Negative']
                )
    
    return ConfusionMatrix   

Decision Tree¶

In [11]:
DecisionTree = DecisionTreeClassifier(criterion = 'entropy', random_state = rand_state, max_depth = 6)
DecisionTree.fit(X_train, y_train)
DecisionTree_predicted = DecisionTree.predict(X_test)
DecisionTree_conf_matrix = confusion_matrix(y_test, DecisionTree_predicted)
DecisionTree_acc_score = accuracy_score(y_test, DecisionTree_predicted)
In [12]:
ConfusionMatrix = createConfusionMatrixDataframe(DecisionTree_conf_matrix)
ConfusionMatrix
Out[12]:
Predicted Positive Predicted Negative
Actual Positive 1300 146
Actual Negative 175 1236
In [13]:
print("Decision Tree Accuracy (%):",round(DecisionTree_acc_score*100,3),'\n')
print(classification_report(y_test,DecisionTree_predicted))
scores["Decision Tree"] = DecisionTree_acc_score
Decision Tree Accuracy (%): 88.764 

              precision    recall  f1-score   support

        died       0.88      0.90      0.89      1446
       lived       0.89      0.88      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857

Logistic Regression¶

In [14]:
LogisticRegression = LogisticRegression()
LogisticRegression = LogisticRegression.fit(X_train, y_train)
LogisticRegression_predict = LogisticRegression.predict(X_test)
LogisticRegression_conf_matrix = confusion_matrix(y_test, LogisticRegression_predict)
LogisticRegression_acc_score = accuracy_score(y_test, LogisticRegression_predict)
In [15]:
ConfusionMatrix = createConfusionMatrixDataframe(LogisticRegression_conf_matrix)
ConfusionMatrix
Out[15]:
Predicted Positive Predicted Negative
Actual Positive 1202 244
Actual Negative 188 1223
In [16]:
print("Logistic Regression accuracy (%):",round(LogisticRegression_acc_score*100,3),'\n')
print(classification_report(y_test,LogisticRegression_predict))
scores["Logistic Regression"] = LogisticRegression_acc_score
Logistic Regression accuracy (%): 84.879 

              precision    recall  f1-score   support

        died       0.86      0.83      0.85      1446
       lived       0.83      0.87      0.85      1411

    accuracy                           0.85      2857
   macro avg       0.85      0.85      0.85      2857
weighted avg       0.85      0.85      0.85      2857

Naive Bayes¶

In [17]:
NaiveBayes = GaussianNB()
NaiveBayes = NaiveBayes.fit(X_train,y_train)
NaiveBayes_predicted = NaiveBayes.predict(X_test)
NaiveBayes_conf_matrix = confusion_matrix(y_test, NaiveBayes_predicted)
NaiveBayes_acc_score = accuracy_score(y_test, NaiveBayes_predicted)
In [18]:
ConfusionMatrix = createConfusionMatrixDataframe(NaiveBayes_conf_matrix)
ConfusionMatrix
Out[18]:
Predicted Positive Predicted Negative
Actual Positive 1209 237
Actual Negative 139 1272
In [19]:
print("Naive Bayes accuracy (%):",round(NaiveBayes_acc_score*100,3),'\n')
print(classification_report(y_test,NaiveBayes_predicted))
scores["Naive Bayes"] = NaiveBayes_acc_score
Naive Bayes accuracy (%): 86.839 

              precision    recall  f1-score   support

        died       0.90      0.84      0.87      1446
       lived       0.84      0.90      0.87      1411

    accuracy                           0.87      2857
   macro avg       0.87      0.87      0.87      2857
weighted avg       0.87      0.87      0.87      2857

Random Forrest¶

In [20]:
RandomForest = RandomForestClassifier(n_estimators = 20, random_state = rand_state, max_depth = 5)
RandomForest.fit(X_train,y_train)
RandomForest_predicted = RandomForest.predict(X_test)
RandomForest_conf_matrix = confusion_matrix(y_test, RandomForest_predicted)
RandomForest_acc_score = accuracy_score(y_test, RandomForest_predicted)
In [21]:
ConfusionMatrix = createConfusionMatrixDataframe(RandomForest_conf_matrix)
ConfusionMatrix
Out[21]:
Predicted Positive Predicted Negative
Actual Positive 1290 156
Actual Negative 151 1260
In [22]:
print("Random Forest accuracy (%):",round(RandomForest_acc_score*100,3),'\n')
print(classification_report(y_test,RandomForest_predicted))
scores["Random Forest"] = RandomForest_acc_score
Random Forest accuracy (%): 89.254 

              precision    recall  f1-score   support

        died       0.90      0.89      0.89      1446
       lived       0.89      0.89      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857

K Neighbors¶

In [23]:
KNeighbor = KNeighborsClassifier(n_neighbors=7)
KNeighbor.fit(X_train, y_train)
KNeighbor_predicted = KNeighbor.predict(X_test)
KNeighbor_conf_matrix = confusion_matrix(y_test, KNeighbor_predicted)
KNeighbor_acc_score = accuracy_score(y_test, KNeighbor_predicted)
In [24]:
ConfusionMatrix = createConfusionMatrixDataframe(KNeighbor_conf_matrix)
ConfusionMatrix
Out[24]:
Predicted Positive Predicted Negative
Actual Positive 1315 131
Actual Negative 191 1220
In [25]:
print("K-Neighbors accuracy (%):",round(KNeighbor_acc_score*100,3),'\n')
print(classification_report(y_test,KNeighbor_predicted))
scores["K-Neighbors"] = KNeighbor_acc_score
K-Neighbors accuracy (%): 88.729 

              precision    recall  f1-score   support

        died       0.87      0.91      0.89      1446
       lived       0.90      0.86      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857

Support Vector¶

In [26]:
SupportVector = SVC(kernel='rbf', C=2)
SupportVector.fit(X_train, y_train)
SupportVector_predicted = SupportVector.predict(X_test)
SupportVector_conf_matrix = confusion_matrix(y_test, SupportVector_predicted)
SupportVector_acc_score = accuracy_score(y_test, SupportVector_predicted)
In [27]:
ConfusionMatrix = createConfusionMatrixDataframe(SupportVector_conf_matrix)
ConfusionMatrix
Out[27]:
Predicted Positive Predicted Negative
Actual Positive 1280 166
Actual Negative 162 1249
In [28]:
print("Support Vector accuracy (%):",round(SupportVector_acc_score*100,3),'\n')
print(classification_report(y_test,SupportVector_predicted))
scores["Support Vector"] = SupportVector_acc_score
Support Vector accuracy (%): 88.519 

              precision    recall  f1-score   support

        died       0.89      0.89      0.89      1446
       lived       0.88      0.89      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857

Gradient Booster¶

In [29]:
GradientBooster = GradientBoostingClassifier()
GradientBooster.fit(X_train, y_train)
GradientBooster_predicted = GradientBooster.predict(X_test)
GradientBooster_conf_matrix = confusion_matrix(y_test, GradientBooster_predicted)
GradientBooster_acc_score = accuracy_score(y_test, GradientBooster_predicted)
In [30]:
ConfusionMatrix = createConfusionMatrixDataframe(GradientBooster_conf_matrix)
ConfusionMatrix
Out[30]:
Predicted Positive Predicted Negative
Actual Positive 1291 155
Actual Negative 145 1266
In [31]:
print("Gradient Booster accuracy (%):",round(GradientBooster_acc_score*100,3),'\n')
print(classification_report(y_test,GradientBooster_predicted))
scores["Gradient Booster"] = GradientBooster_acc_score
Gradient Booster accuracy (%): 89.499 

              precision    recall  f1-score   support

        died       0.90      0.89      0.90      1446
       lived       0.89      0.90      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.90      0.89      2857
weighted avg       0.90      0.89      0.89      2857

Show performance of all models¶

In [32]:
accuracy = pd.DataFrame.from_dict(scores, orient='index', columns=['Accuracy']) 
accuracy = accuracy.sort_values('Accuracy', ascending=False)
print("The best performing model was the: " + accuracy.reset_index().iloc[0,0])
accuracy
The best performing model was the: Gradient Booster
Out[32]:
Accuracy
Gradient Booster 0.894995
Random Forest 0.892545
Decision Tree 0.887644
K-Neighbors 0.887294
Support Vector 0.885194
Naive Bayes 0.868393
Logistic Regression 0.848792

Feature importance (using Random Forest)¶

In [33]:
feature_importances = RandomForest.feature_importances_
In [34]:
feature_importance.loc[0] = feature_importances
In [35]:
feature_importance = feature_importance.T
feature_importance = feature_importance.rename(columns={feature_importance.columns[0]: 'Importance Score'})
In [36]:
feature_importance = feature_importance.sort_values(['Importance Score'], ascending = [False])
print("The most important feature was: " + feature_importance.reset_index().iloc[0,0])
feature_importance
The most important feature was: age
Out[36]:
Importance Score
age 0.752708
bmi 0.235472
cholesterol 0.009161
has_diabetes 0.001359
has_asthma 0.001299
In [37]:
feature_importance['Importance Score'].plot(kind='bar')
plt.xticks(range(len(feature_importance.index)), feature_importance.index) 
plt.ylabel('Importance Score')
plt.title('Features by Importance Score')
plt.show()