# Shared seed: passed as random_state to train_test_split and to the
# decision-tree and random-forest classifiers further down.
rand_state = 42


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')


# Read the 'coronvirus_march_2020' sheet of the workbook and report its size.
df = pd.read_excel("Covid.xlsx", sheet_name='coronvirus_march_2020')
n_rows, n_cols = df.shape
print("df has " + str(n_rows) + " rows and " + str(n_cols) + " columns")
df.head()

df has 8044 rows and 6 columns


# Accuracy of every model, keyed by its display name; filled in as each
# model is evaluated.
scores = {}


# Encode the 'no' / 'yes' cells as 0 / 1 across the whole frame.
df = df.replace(['no'], 0).replace(['yes'], 1)
df.head()


# Separate the target column from the predictors.
y = df["outcome"]
X = df.drop(columns='outcome')
feature_importance = X.head(0) # used at end of notebook
y.value_counts()

lived    5713
died     2331
Name: outcome, dtype: int64


from imblearn.over_sampling import RandomOverSampler

# Balance the classes by random over-sampling until the minority/majority
# ratio is 1 (sampling_strategy=1).
# The sampler is seeded with rand_state so the resampled data set, and every
# score computed from it, is reproducible, in line with the seeded
# train_test_split and tree models below.
# NOTE(review): resampling happens before train_test_split, so copies of the
# same row can end up in both the training and the test set, which tends to
# inflate the reported test scores. Consider splitting first and resampling
# only the training portion.
rus = RandomOverSampler(sampling_strategy=1, random_state=rand_state)
X, y = rus.fit_resample(X, y)
y.value_counts()

lived    5713
died     5713
Name: outcome, dtype: int64


# Hold out 25% of the rows as the test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=rand_state,
)


# Standardise the features: the scaler is fitted on the training rows only and
# the same transformation is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


def createConfusionMatrixDataframe(confusion_matrix, labels=('Positive', 'Negative')):
    """Wrap a confusion matrix in a DataFrame with readable axis labels.

    Args:
        confusion_matrix: Square array-like of counts. Rows are labelled
            "Actual <label>" and columns "Predicted <label>".
        labels: Class names in the row/column order of the matrix. The
            default reproduces the original "Positive"/"Negative" headings;
            pass the real class names (e.g. ``('died', 'lived')``) for an
            unambiguous table, or more than two names for a multi-class
            matrix.

    Returns:
        pandas.DataFrame holding the same counts with labelled index and
        columns.

    Raises:
        ValueError: Raised by pandas when the matrix shape does not match
            the number of labels.

    NOTE(review): scikit-learn's ``confusion_matrix`` orders classes by
    sorted label, so in this notebook the first row/column presumably
    corresponds to 'died' rather than to a generic "positive" class --
    confirm before interpreting the default headings.
    """
    class_names = list(labels)
    return pd.DataFrame(
        confusion_matrix,
        index=[f'Actual {name}' for name in class_names],
        columns=[f'Predicted {name}' for name in class_names],
    )


# Entropy-based decision tree capped at depth 6; fit on the training set and
# evaluated on the held-out test set.
DecisionTree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    random_state=rand_state,
)
DecisionTree_predicted = DecisionTree.fit(X_train, y_train).predict(X_test)
DecisionTree_acc_score = accuracy_score(y_test, DecisionTree_predicted)
DecisionTree_conf_matrix = confusion_matrix(y_test, DecisionTree_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(DecisionTree_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
scores["Decision Tree"] = DecisionTree_acc_score
print("Decision Tree Accuracy (%):", round(DecisionTree_acc_score * 100, 3), '\n')
print(classification_report(y_test, DecisionTree_predicted))

Decision Tree Accuracy (%): 88.764 

              precision    recall  f1-score   support

        died       0.88      0.90      0.89      1446
       lived       0.89      0.88      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857


# Logistic regression with scikit-learn's default settings; fit on the
# training set and evaluated on the held-out test set.
# The fitted estimator gets its own name: binding it to ``LogisticRegression``
# would replace the imported class with an instance, so running this cell a
# second time would fail with "'LogisticRegression' object is not callable".
LogisticRegressionModel = LogisticRegression()
LogisticRegressionModel.fit(X_train, y_train)
LogisticRegression_predict = LogisticRegressionModel.predict(X_test)
LogisticRegression_conf_matrix = confusion_matrix(y_test, LogisticRegression_predict)
LogisticRegression_acc_score = accuracy_score(y_test, LogisticRegression_predict)


ConfusionMatrix = createConfusionMatrixDataframe(LogisticRegression_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
print("Logistic Regression accuracy (%):", round(LogisticRegression_acc_score * 100, 3), '\n')
print(classification_report(y_test, LogisticRegression_predict))
scores["Logistic Regression"] = LogisticRegression_acc_score

Logistic Regression accuracy (%): 84.879 

              precision    recall  f1-score   support

        died       0.86      0.83      0.85      1446
       lived       0.83      0.87      0.85      1411

    accuracy                           0.85      2857
   macro avg       0.85      0.85      0.85      2857
weighted avg       0.85      0.85      0.85      2857


# Gaussian naive Bayes with default settings; fit on the training set and
# evaluated on the held-out test set.
NaiveBayes = GaussianNB().fit(X_train, y_train)
NaiveBayes_predicted = NaiveBayes.predict(X_test)
NaiveBayes_acc_score = accuracy_score(y_test, NaiveBayes_predicted)
NaiveBayes_conf_matrix = confusion_matrix(y_test, NaiveBayes_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(NaiveBayes_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
scores["Naive Bayes"] = NaiveBayes_acc_score
print("Naive Bayes accuracy (%):", round(NaiveBayes_acc_score * 100, 3), '\n')
print(classification_report(y_test, NaiveBayes_predicted))

Naive Bayes accuracy (%): 86.839 

              precision    recall  f1-score   support

        died       0.90      0.84      0.87      1446
       lived       0.84      0.90      0.87      1411

    accuracy                           0.87      2857
   macro avg       0.87      0.87      0.87      2857
weighted avg       0.87      0.87      0.87      2857


# Random forest of 20 trees, each capped at depth 5; fit on the training set
# and evaluated on the held-out test set.
RandomForest = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=rand_state,
)
RandomForest_predicted = RandomForest.fit(X_train, y_train).predict(X_test)
RandomForest_acc_score = accuracy_score(y_test, RandomForest_predicted)
RandomForest_conf_matrix = confusion_matrix(y_test, RandomForest_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(RandomForest_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
scores["Random Forest"] = RandomForest_acc_score
print("Random Forest accuracy (%):", round(RandomForest_acc_score * 100, 3), '\n')
print(classification_report(y_test, RandomForest_predicted))

Random Forest accuracy (%): 89.254 

              precision    recall  f1-score   support

        died       0.90      0.89      0.89      1446
       lived       0.89      0.89      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857


# k-nearest-neighbours classifier using 7 neighbours; fit on the training set
# and evaluated on the held-out test set.
KNeighbor = KNeighborsClassifier(n_neighbors=7)
KNeighbor_predicted = KNeighbor.fit(X_train, y_train).predict(X_test)
KNeighbor_acc_score = accuracy_score(y_test, KNeighbor_predicted)
KNeighbor_conf_matrix = confusion_matrix(y_test, KNeighbor_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(KNeighbor_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
scores["K-Neighbors"] = KNeighbor_acc_score
print("K-Neighbors accuracy (%):", round(KNeighbor_acc_score * 100, 3), '\n')
print(classification_report(y_test, KNeighbor_predicted))

K-Neighbors accuracy (%): 88.729 

              precision    recall  f1-score   support

        died       0.87      0.91      0.89      1446
       lived       0.90      0.86      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857


# Support-vector classifier with an RBF kernel and C=2; fit on the training
# set and evaluated on the held-out test set.
SupportVector = SVC(C=2, kernel='rbf')
SupportVector_predicted = SupportVector.fit(X_train, y_train).predict(X_test)
SupportVector_acc_score = accuracy_score(y_test, SupportVector_predicted)
SupportVector_conf_matrix = confusion_matrix(y_test, SupportVector_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(SupportVector_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
scores["Support Vector"] = SupportVector_acc_score
print("Support Vector accuracy (%):", round(SupportVector_acc_score * 100, 3), '\n')
print(classification_report(y_test, SupportVector_predicted))

Support Vector accuracy (%): 88.519 

              precision    recall  f1-score   support

        died       0.89      0.89      0.89      1446
       lived       0.88      0.89      0.88      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.89      0.89      2857
weighted avg       0.89      0.89      0.89      2857


# Gradient-boosted trees with default hyper-parameters; fit on the training
# set and evaluated on the held-out test set.
# Seeded with rand_state like the decision tree and random forest, so repeated
# runs of the notebook produce the same model and the same score.
GradientBooster = GradientBoostingClassifier(random_state=rand_state)
GradientBooster.fit(X_train, y_train)
GradientBooster_predicted = GradientBooster.predict(X_test)
GradientBooster_conf_matrix = confusion_matrix(y_test, GradientBooster_predicted)
GradientBooster_acc_score = accuracy_score(y_test, GradientBooster_predicted)


ConfusionMatrix = createConfusionMatrixDataframe(GradientBooster_conf_matrix)
ConfusionMatrix


# Report accuracy as a percentage plus the per-class metrics, and record the
# raw accuracy for the final comparison table.
print("Gradient Booster accuracy (%):", round(GradientBooster_acc_score * 100, 3), '\n')
print(classification_report(y_test, GradientBooster_predicted))
scores["Gradient Booster"] = GradientBooster_acc_score

Gradient Booster accuracy (%): 89.499 

              precision    recall  f1-score   support

        died       0.90      0.89      0.90      1446
       lived       0.89      0.90      0.89      1411

    accuracy                           0.89      2857
   macro avg       0.89      0.90      0.89      2857
weighted avg       0.90      0.89      0.89      2857


# One row per model, best accuracy first; the top row's index label is the
# name of the best-performing model.
accuracy = (
    pd.DataFrame.from_dict(scores, orient='index', columns=['Accuracy'])
    .sort_values('Accuracy', ascending=False)
)
print("The best performing model was the: " + accuracy.index[0])
accuracy

The best performing model was the: Gradient Booster


# Per-feature importance scores reported by the fitted random forest.
feature_importances = RandomForest.feature_importances_


# Build the importance table directly: one row per feature, labelled with the
# column names of X (the forest was fitted on those columns, in that order).
# Constructing it in one step avoids enlarging an empty slice of X via
# ``.loc[0] = ...`` and transposing it afterwards, and keeps the cell safe to
# re-run because nothing here depends on an earlier, already-reshaped
# ``feature_importance`` placeholder.
feature_importance = pd.DataFrame(
    {'Importance Score': feature_importances},
    index=X.columns,
)


# Most important feature first.
feature_importance = feature_importance.sort_values('Importance Score', ascending=False)
print("The most important feature was: " + feature_importance.index[0])
feature_importance

The most important feature was: age


# Bar chart of the importance scores, one bar per feature in table order.
importance_scores = feature_importance['Importance Score']
importance_scores.plot.bar()
plt.xticks(range(len(feature_importance)), feature_importance.index)
plt.title('Features by Importance Score')
plt.ylabel('Importance Score')
plt.show()

	has_diabetes	age	bmi	cholesterol	outcome
0	1	41	21	191	lived
1	0	51	18	239	lived
2	1	99	24	259	died
3	1	76	26	268	lived
4	1	48	25	218	lived

	Importance Score
age	0.752708
bmi	0.235472
cholesterol	0.009161
has_diabetes	0.001359
has_asthma	0.001299

In this notebook I compare the performance of the DecisionTreeClassifier, LogisticRegression, GaussianNB, RandomForestClassifier, KNeighborsClassifier, SVC, and GradientBoostingClassifier models in predicting the outcome (lived, died) of COVID-19 infection.¶

Imports¶

Prepare and examine data¶

Function to create confusion matrix dataframe¶

Decision Tree¶

Logistic Regression¶

Naive Bayes¶

Random Forest¶

K Neighbors¶

Support Vector¶

Gradient Booster¶

Show performance of all models¶

Feature importance (using Random Forest)¶

	Accuracy
Gradient Booster	0.894995
Random Forest	0.892545
Decision Tree	0.887644
K-Neighbors	0.887294
Support Vector	0.885194
Naive Bayes	0.868393
Logistic Regression	0.848792

	Predicted Positive	Predicted Negative
Actual Positive	1300	146
Actual Negative	175	1236

	Predicted Positive	Predicted Negative
Actual Positive	1202	244
Actual Negative	188	1223

	Predicted Positive	Predicted Negative
Actual Positive	1209	237
Actual Negative	139	1272

	Predicted Positive	Predicted Negative
Actual Positive	1290	156
Actual Negative	151	1260