This Python notebook describes several statistical tests and concepts, presented in roughly alphabetical order.¶
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import random
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
C:\Users\lewis\AppData\Local\Temp\ipykernel_26208\3378953940.py:3: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
import pandas as pd
Anderson Darling Normality Test¶
This test determines if a data set is normally distributed.
from scipy.stats import anderson

# Anderson-Darling normality test on 1000 draws from a standard normal.
normally_distibuted_data = np.random.normal(size=1000)
anderson_result = anderson(normally_distibuted_data)

# Take the third (index 2) significance level / critical value pair reported
# by anderson(); for the default 'norm' distribution this is the 5% level.
significance_level = anderson_result.significance_level[2]
critical_value = anderson_result.critical_values[2]

# Normality is rejected only when the statistic exceeds the critical value,
# so a statistic below it means "not rejected" (it is not "p <= 0.05").
if anderson_result.statistic < critical_value:
    print(f"Data follows normal distribution (not rejected at the {significance_level}% significance level).")
else:
    print(f"Data does not follow normal distribution (rejected at the {significance_level}% significance level).")
print("The statistic is: " + str(anderson_result.statistic))
Data follows normal distribution (p ≤ 0.05). The statistic is: 0.35825734505897344
ANOVA (and Tukey)¶
Independent one way
The ANOVA is used to determine whether there are any statistically significant differences between the means of three or more independent groups. The Tukey test is used to determine where the differences lie.
from scipy.stats import f_oneway # for ANOVA
from statsmodels.stats.multicomp import pairwise_tukeyhsd # for Tukey test
# Create data: 30 scores, 10 consecutive rows per group label.
df = pd.DataFrame({
    'score': [85, 86, 88, 75, 78, 94, 98, 79, 71, 80,   # group a
              91, 92, 93, 90, 97, 94, 82, 88, 95, 96,   # group b
              79, 78, 88, 94, 92, 85, 83, 85, 82, 81],  # group c
    'group': np.repeat(['a', 'b', 'c'], repeats=10),
})

# One plain list of scores per group, as expected by f_oneway.
a = df.score[df.group == 'a'].tolist()
b = df.score[df.group == 'b'].tolist()
c = df.score[df.group == 'c'].tolist()

# Perform ANOVA (null hypothesis: all group means are equal)
statistic, pvalue = f_oneway(a, b, c)
if pvalue >= 0.05:
    print("There is no significant difference between the means.")
else:
    print("There is a significant difference between the means.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}\n")
# Tukey's HSD post-hoc test: pairwise comparison of the group means at a
# 5% family-wise error rate, printed as a summary table.
tukey = pairwise_tukeyhsd(df['score'], df['group'], alpha=0.05)
print(tukey)
There is a significant difference between the means.
The statistic is: 5.167774552944481, the p-value is: 0.012582197136592605
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=====================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------
a b 8.4 0.0159 1.4259 15.3741 True
a c 1.3 0.8894 -5.6741 8.2741 False
b c -7.1 0.0454 -14.0741 -0.1259 True
-----------------------------------------------------
Independent two way
This test compares the mean differences between groups that have been split on two independent variables (factors). This test determines if there is an interaction between the two independent variables on the dependent variable.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Create data: 30 heights crossed over two factors,
# 'water' (2 levels x 15 rows) and 'sun' (3 levels x 5 rows, repeated twice).
watering = np.repeat(['daily', 'weekly'], 15)
sunlight = np.tile(np.repeat(['low', 'med', 'high'], 5), 2)
heights = [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
           6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
           4, 4, 4, 4, 4, 5, 6, 6, 7, 8]
df = pd.DataFrame({'water': watering, 'sun': sunlight, 'height': heights})

# Perform ANOVA: both main effects plus their interaction, type 2 table.
formula = 'height ~ C(water) + C(sun) + C(water):C(sun)'
model = ols(formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
sum_sq df F PR(>F) C(water) 8.533333 1.0 16.0000 0.000527 C(sun) 24.866667 2.0 23.3125 0.000002 C(water):C(sun) 2.466667 2.0 2.3125 0.120667 Residual 12.800000 24.0 NaN NaN
Within-subjects (repeated measures)
This test is used to compare group means where the same participants are measured in every condition. The example below uses two within-subject factors (time and condition).
import statsmodels.api as sm
from statsmodels.stats.anova import AnovaRM

# Create data in long format: one row per subject x condition x time,
# i.e. 3 subjects x 2 conditions x 3 time points = 18 rows.
performance = [
    0.1, 0.2, 0.3,  # subject 1, condition 1
    0.1, 0.2, 0.4,  # subject 1, condition 2
    0.3, 0.3, 0.4,  # subject 2, condition 1
    0.3, 0.4, 0.5,  # subject 2, condition 2
    0.5, 0.6, 0.7,  # subject 3, condition 1
    0.6, 0.6, 0.8,  # subject 3, condition 2
]
data = {
    'subject': [subject for subject in ('S1', 'S2', 'S3') for _ in range(6)],
    'time': ['T1', 'T2', 'T3'] * 6,
    'condition': (['C1'] * 3 + ['C2'] * 3) * 3,
    'performance': performance,
}
df = pd.DataFrame(data)

# Perform ANOVA with 'time' and 'condition' as within-subject factors
aovrm = AnovaRM(data=df, depvar='performance', subject='subject',
                within=['time', 'condition'])
res = aovrm.fit()
print(res.summary())
Anova
===========================================
F Value Num DF Den DF Pr > F
-------------------------------------------
time 44.8000 2.0000 4.0000 0.0018
condition 25.0000 1.0000 2.0000 0.0377
time:condition 1.6000 2.0000 4.0000 0.3086
===========================================
Augmented Dickey-Fuller Test¶
This test determines if a time series has stationarity (constant mean, variance, and covariance over time).
# Create 2-column dataframe with 'Date' and 'Sales':
# 22 consecutive daily dates (dd/mm/YYYY strings) starting 12/02/2024.
dates = pd.date_range(start='2024-02-12', periods=22, freq='D').strftime('%d/%m/%Y').tolist()
sales = [1757, 1844, 1612, 1742, 1144, 1778, 1105, 1388, 1936, 1119, 850,
         812, 812, 848, 763, 781, 794, 792, 780, 763, 785, 828]
data = {'Date': dates, 'Sales': sales}
df = pd.DataFrame(data)

# Line plot of sales against date, with vertical tick labels.
axes = sns.lineplot(data=df, x='Date', y='Sales')
axes.set_title('Sales Over Time', size=15)
axes.set_xlabel('Time')
axes.set_ylabel('Sales')
plt.xticks(rotation=90);
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the 'Sales' column. Only the statistic and
# p-value are used below; the other four returned values are unpacked so the
# six-element result can be destructured in one statement.
statistic, pvalue, lags, obs, crit, t = adfuller(df['Sales'])

# A p-value at or above 0.05 is reported here as "not stationary".
if pvalue >= 0.05:
    print('Data does not exhibit stationarity.')
else:
    print('Data does exhibit stationarity.')
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
Data does not exhibit stationarity. The statistic is: 1.1559905031499378, the p-value is: 0.9956672577818104
Barnard's Test¶
This test is used to determine if there is a significant association between two categorical variables in a 2x2 contingency table.
from scipy.stats import barnard_exact

# Vaccine (experiment) group
exp_infected = 9
exp_uninfected = 91

# Placebo (control) group
ctrl_infected = 19
ctrl_uninfected = 81

# 2x2 contingency table: rows are outcome (infected / uninfected),
# columns are group (experiment / control).
result = barnard_exact(
    [[exp_infected, ctrl_infected],
     [exp_uninfected, ctrl_uninfected]]
)

if result.pvalue >= 0.05:
    print("There is no significant difference between the groups.")
else:
    print("There is a significant difference between the groups.")
print(f"The statistic is: {round(result.statistic, 3)}, the p-value is: {round(result.pvalue, 3)}")
There is a significant difference between the groups. The statistic is: -2.038, the p-value is: 0.044
Bland-Altman plot¶
A Bland-Altman plot is a scatter plot where the differences between two measurements are plotted against their averages.
from pingouin import plot_blandaltman

# Two sets of 15 paired measurements, one column per measurement occasion.
before_treatment = [91.3, 107.3, 118.2, 132.3, 115.6, 97.4, 113, 85.1, 95.3, 106.6, 80.2, 91.9, 83.7, 117.8, 123.9]
after_treatment = [148.8, 118.2, 112.2, 146.3, 129.2, 79.7, 89.3, 69.1, 135.8, 112.4, 116.8, 97.9, 99.8, 99.4, 106.9]
df = pd.DataFrame({'Before Treatment': before_treatment,
                   'After Treatment': after_treatment})

# Draw the Bland-Altman plot for the two columns.
ax = plot_blandaltman(df['Before Treatment'], df['After Treatment'])
Bartlett’s Test¶
This test determines whether the variances of several data sets are equal.
from scipy.stats import bartlett

# Three samples of 10 observations each.
A = [85, 86, 88, 75, 78, 94, 98, 79, 71, 80]
B = [91, 92, 93, 85, 87, 84, 82, 88, 95, 96]
C = [79, 78, 88, 94, 92, 85, 83, 85, 82, 81]

# Bartlett's test (null hypothesis: all samples have equal variance)
statistic, pvalue = bartlett(A, B, C)
if pvalue >= 0.05:
    print("There is no significant difference between the variances.")
else:
    print("There is a significant difference between the variances.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is no significant difference between the variances. The statistic is: 3.3024375753550594, the p-value is: 0.19181598314035977
Chi-Square (goodness of fit)¶
The chi-square goodness of fit test assesses the differences between the observed and expected proportions.
from scipy.stats import chisquare

# Observed and expected counts for 7 categories (both sum to 59).
observed_data = [8, 6, 10, 7, 8, 11, 9]
expected_data = [9, 8, 11, 8, 10, 7, 6]

statistic, pvalue = chisquare(observed_data, expected_data)
if pvalue >= 0.05:
    print("There is no significant difference between the observed and expected data.")
else:
    print("There is a significant difference between the observed and expected data.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is no significant difference the observed and expected data. The statistic is: 5.0127344877344875, the p-value is: 0.542180861413329
Cohen's Kappa¶
This test determines the level of agreement between two raters/judges who each classify items into categories. To determine the level of agreement between more than two raters/judges, use Fleiss' Kappa.
from sklearn.metrics import cohen_kappa_score

# Category assigned to each of 8 items by two raters.
rater1 = ["negative", "positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
rater2 = ["positive", "positive", "negative", "neutral", "neutral", "negative", "neutral", "positive"]

kappa = cohen_kappa_score(rater1, rater2)
print("Cohen's Kappa score:", kappa)

# Map the score onto a verbal agreement band; the first matching upper
# bound wins, so the bands are checked from lowest to highest.
if kappa < 0.1:
    print("No agreement.")
elif kappa < 0.2:
    print("Slight agreement.")
elif kappa < 0.4:
    print("Fair agreement.")
elif kappa < 0.6:
    print("Moderate agreement.")
elif kappa < 0.8:
    print("Substantial agreement.")
elif kappa < 0.99:
    print("Near perfect agreement.")
else:
    print("Perfect agreement.")
Cohen's Kappa score: 0.627906976744186 Substantial agreement.
Cronbach's Alpha¶
This test measures the internal consistency, or reliability, of a set of survey items (such as Likert scale ratings).
from pingouin import cronbach_alpha

# Ratings for 3 items (columns '1', '2', '3'), 8 rows each.
dataframe = pd.DataFrame({
    '1': [1, 2, 2, 5, 5, 3, 2, 4],
    '2': [2, 2, 2, 4, 5, 1, 3, 3],
    '3': [3, 2, 3, 3, 5, 2, 3, 5],
})

# Pass the dataframe built above (the original passed `data`, an unrelated
# variable left over from an earlier cell). Element 0 of the result is used
# as the alpha statistic.
statistic = cronbach_alpha(dataframe)[0]
print("The statistic is: " + str(statistic))

# Map alpha onto a verbal band, checked from lowest to highest.
if statistic < 0.5:
    print("Unacceptable internal consistency.")
elif statistic < 0.6:
    print("Poor internal consistency.")
elif statistic < 0.7:
    print("Questionable internal consistency.")
elif statistic < 0.8:
    print("Acceptable internal consistency.")
elif statistic < 0.9:
    print("Good internal consistency.")
else:
    print("Excellent internal consistency.")
The statistic is: 0.8546511627906975 Good internal consistency.
D'Agostino's K-squared Test¶
This test determines if a data set is normally distributed.
from scipy.stats import normaltest

# D'Agostino's K-squared test on 1000 draws from a standard normal.
normally_distibuted_data = np.random.normal(size=1000)
statistic, pvalue = normaltest(normally_distibuted_data)

if pvalue >= 0.05:
    print('Data follows normal distribution.')
else:
    print('Data does not follow normal distribution.')
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
Data follows normal distribution. The statistic is: 4.6590595799765975, the p-value is: 0.09734150728151729
Fisher's Test¶
This test is used to determine if there is a significant association between two categorical variables in a 2x2 contingency table.
from scipy.stats import fisher_exact

# Vaccine (experiment) group
exp_infected = 8
exp_uninfected = 92

# Placebo (control) group
ctrl_infected = 19
ctrl_uninfected = 81

# 2x2 contingency table: rows are outcome (infected / uninfected),
# columns are group (experiment / control).
result = fisher_exact(
    [[exp_infected, ctrl_infected],
     [exp_uninfected, ctrl_uninfected]]
)
# result[0] is printed as the statistic, result[1] is used as the p-value.
odds_ratio, pvalue = result[0], result[1]

if pvalue >= 0.05:
    print("There is no significant difference between the groups.")
else:
    print("There is a significant difference between the groups.")
print(f"The statistic is: {round(odds_ratio, 3)}, the p-value is: {round(pvalue, 3)}")
There is a significant difference between the groups. The statistic is: 0.371, the p-value is: 0.037
Friedman Test¶
Friedman's test is a nonparametric test that compares three or more paired groups.
from scipy.stats import friedmanchisquare

# Three repeated measures, 10 paired observations each.
measure1 = [4, 6, 3, 4, 3, 2, 2, 7, 6, 5]
measure2 = [5, 6, 8, 5, 7, 8, 4, 6, 4, 5]
measure3 = [2, 4, 4, 3, 2, 2, 2, 4, 3, 2]

statistic, pvalue = friedmanchisquare(measure1, measure2, measure3)
if pvalue >= 0.05:
    print("There is no significant difference between the measures.")
else:
    print("There is a significant difference between the measures.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a significant difference between the measures. The statistic is: 12.666666666666673, the p-value is: 0.0017761035457343726
Kendall's Tau Correlation¶
Kendall's Tau is a non-parametric measure of relationships between columns of ranked data.
from scipy.stats import kendalltau

# Ranks given to 12 candidates by two rankers.
data = {
    'candidate': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'],
    'ranker1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'ranker2': [3, 2, 6, 4, 1, 5, 8, 7, 10, 12, 9, 11],
}
df = pd.DataFrame(data)

statistic, pvalue = kendalltau(df['ranker1'], df['ranker2'])
# `>=` (rather than `>`) matches the 0.05 threshold convention used by the
# other tests in this notebook.
if pvalue >= 0.05:
    print('There is not a relationship between the two rankers.')
else:
    print('There is a relationship between the two rankers.')
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a relationship between the two rankers. The statistic is: 0.6666666666666666, the p-value is: 0.0018032758136924804
Kolmogorov-Smirnov Tests¶
Test of normality
This test determines if data is normally distributed.
from scipy.stats import kstest

# 80 distinct integers sampled without replacement from 0..98
# (not normally distributed).
data = random.sample(range(0, 99), 80)

# kstest(data, 'norm') on its own compares against the *standard* normal
# N(0, 1), which rejects any data not centred on 0 with unit spread, whatever
# its shape. Supplying the sample mean and standard deviation tests against a
# normal distribution fitted to the data instead.
# NOTE: because the parameters are estimated from the same sample, the
# resulting p-value is conservative (the test is less likely to reject).
statistic, pvalue = kstest(data, 'norm', args=(np.mean(data), np.std(data, ddof=1)))

if pvalue >= 0.05:
    print("Data is likely normally distributed.")
else:
    print("Data unlikely to be normally distributed.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
Data unlikely to be normally distributed. The statistic is: 0.9736501019683699, the p-value is: 9.19062634619366e-127
Two sample test
This test determines whether two samples come from the same distribution.
from scipy.stats import ks_2samp

# Two normal samples with the same mean but different spread.
data1 = np.random.normal(100, 10, 500)  # mean, std. dev., size (samples)
data2 = np.random.normal(100, 13, 500)  # mean, std. dev., size (samples)

statistic, pvalue = ks_2samp(data1, data2)
if pvalue < 0.05:
    print("The sample distributions are different (reject null hypothesis)")
else:
    print("The sample distributions are the same (do not reject null hypothesis)")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
The sample distributions are different (reject null hypothesis) The statistic is: 0.088, the p-value is: 0.04158607858587287
Kruskal-Wallis Test¶
The Kruskal Wallis test is a nonparametric test that compares three or more independent groups. It can be thought of as a one-way ANOVA on ranks.
from scipy.stats import kruskal

# Three independent groups of 10 observations each.
group1 = [7, 14, 14, 13, 12, 9, 6, 14, 12, 8]
group2 = [15, 17, 13, 15, 15, 13, 9, 12, 10, 8]
group3 = [6, 8, 8, 9, 5, 14, 13, 8, 10, 9]

statistic, pvalue = kruskal(group1, group2, group3)
if pvalue >= 0.05:
    print("There is no significant difference between the groups.")
else:
    print("There is a significant difference between the groups.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a significant difference between the groups. The statistic is: 6.287801578353988, the p-value is: 0.043114289703508814
Kurtosis¶
Kurtosis is a measure of the 'tailedness' of a distribution.
# 70 draws from a normal distribution with mean 20 and std. dev. 5.
data = np.random.normal(loc=20, scale=5, size=70)

# Histogram with a KDE overlay. sns.distplot is deprecated, so histplot is
# used; stat='density' keeps the y-axis on a density scale as distplot did.
distplot = sns.histplot(data, kde=True, stat='density', color='green')

# This section is about kurtosis, so compute kurtosis (the original computed
# skew here). scipy's default is Fisher's definition, i.e. excess kurtosis,
# for which a normal distribution scores 0.
from scipy.stats import kurtosis
kurtosis(data)
0.07649949342058045
Levene's Test¶
This test determines if variances are equal between measures.
from scipy.stats import levene

measure1 = np.random.normal(100, 10, 30)  # mean 100, std. dev 10, 30 samples
measure2 = np.random.normal(10, 18, 30)   # mean 10, std. dev 18, 30 samples

# Levene's test (null hypothesis: the samples have equal variances)
statistic, pvalue = levene(measure1, measure2)
if pvalue >= 0.05:
    print("There is no significant difference between the measures.")
else:
    print("There is a significant difference between the measures.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is no significant difference between the measures. The statistic is: 1.7031485666986301, the p-value is: 0.1970306981285739
Logrank Test¶
This test compares the survival distributions of two independent groups.
from lifelines.statistics import logrank_test

# One row per subject: observed duration, event flag and treatment group.
df = pd.DataFrame({
    'durations': [12, 15, 13, 16, 12, 19, 16, 20, 18, 21],
    'events': [1, 0, 1, 1, 1, 1, 1, 1, 1, 1],  # 0 is censored, 1 is not censored
    'drug_group': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
})

# Split the rows by treatment group.
drug_a = df[df['drug_group'] == 'A']
drug_b = df[df['drug_group'] == 'B']

results = logrank_test(durations_A=drug_a['durations'],
                       event_observed_A=drug_a['events'],
                       durations_B=drug_b['durations'],
                       event_observed_B=drug_b['events'])
statistic, pvalue = results.test_statistic, results.p_value

if pvalue >= 0.05:
    print("The distribution curves for each group are the same.")
else:
    print("The distribution curves for each group are not the same.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
The distribution curves for each group are not the same. The statistic is: 5.828516377649327, the p-value is: 0.01576841639354035
Mauchly's Test of Sphericity¶
The Mauchly's test of sphericity is used to assess whether or not the assumption of sphericity is met. The assumption of sphericity is met when there is equal variance of difference scores across different measurement periods.
from pingouin import sphericity

# Wide-format data: one column per measurement (A, B, C), 5 rows.
df = pd.DataFrame({'A': [65, 92, 28, 68, 24],
                   'B': [58, 48, 55, 60, 45],
                   'C': [60, 49, 55, 64, 49]})

# The last element of the sphericity() result is used as the p-value,
# rounded to 3 decimals.
pvalue = round(sphericity(df, method='mauchly')[-1], 3)

if pvalue >= 0.05:
    print("Variances of the differences are equal.")
else:
    print("Variances of the differences are not equal")
print("The p-value is: " + str(pvalue))
Variances of the differences are not equal The p-value is: 0.001
One-Sample Cramér-von Mises Test¶
This test determines whether a data set is normally distributed.
from scipy.stats import cramervonmises

# 80 distinct integers sampled without replacement from 0..98
# (not normally distributed).
sample = random.sample(range(0, 99), 80)

# cramervonmises(sample, 'norm') on its own compares against the *standard*
# normal N(0, 1), which rejects any data not centred on 0 with unit spread.
# Supplying the sample mean and standard deviation tests against a normal
# distribution fitted to the data instead.
# NOTE: because the parameters are estimated from the same sample, the
# resulting p-value is conservative (the test is less likely to reject).
result = cramervonmises(sample, 'norm', args=(np.mean(sample), np.std(sample, ddof=1)))
statistic = result.statistic
pvalue = result.pvalue

if pvalue >= 0.05:
    print("Data is likely normally distributed.")
else:
    print("Data is likely not normally distributed.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
Data is likely not normally distributed. The statistic is: 25.593166406391035, the p-value is: 4.3828350948516e-09
Pearson R Correlation¶
Pearson's correlation coefficient is a measure of the strength of a linear association between two variables.
from scipy.stats import pearsonr

# 23 paired observations.
kg = [85, 95, 98, 86, 67, 83, 78, 64, 79, 87, 60, 86, 99, 73, 94, 88, 83, 68, 99, 75, 98, 68, 93]
calories = [2465, 2945, 2940, 2838, 2010, 2407, 2418, 1856, 2607, 2871, 1920, 2752, 2970, 2263, 3102, 2816, 2739, 1972, 3168, 2400, 3234, 2040, 2976]

statistic, pvalue = pearsonr(kg, calories)

# A small p-value means the correlation IS significant. The original had the
# two messages swapped, reporting "not a correlation" for p < 0.05.
if pvalue < 0.05:
    conclusion = "There is a correlation between the values."
else:
    conclusion = "There is not a correlation between the values."
print(conclusion)
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is not a correlation between the values. The statistic is: 0.9619689837547747, the p-value is: 2.604391095408313e-13
Skewness¶
Skewness is a measure of the asymmetry of a distribution.
# 70 draws from a normal distribution with mean 20 and std. dev. 5.
data = np.random.normal(loc=20, scale=5, size=70)

# Histogram with a KDE overlay. sns.distplot is deprecated, so histplot is
# used; stat='density' keeps the y-axis on a density scale as distplot did.
distplot = sns.histplot(data, kde=True, stat='density', color='green')

# Sample skewness of the data.
from scipy.stats import skew
skew(data)
-0.1631136257877783
Shapiro-Wilk¶
This test determines whether a data set is normally distributed.
from scipy.stats import shapiro

data = np.random.normal(loc=20, scale=5, size=150)  # mean 20, std. dev 5, 150 samples

statistic, pvalue = shapiro(data)
if pvalue >= 0.05:
    print("Data is normally distributed.")
else:
    print("Data is not normally distributed.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
Data is normally distibuted. The statistic is: 0.9940446615219116, the p-value is: 0.7971838116645813
T-test¶
Independent (Student)
This test compares the means of two independent groups (assuming equal variances).
from scipy.stats import ttest_ind

height_female = np.random.normal(loc=170, scale=6, size=10)  # mean 170, std. dev 6, 10 samples
height_male = np.random.normal(loc=163, scale=5, size=10)    # mean 163, std. dev 5, 10 samples

# Called with the default settings (no equal_var override).
statistic, pvalue = ttest_ind(height_female, height_male)
if pvalue >= 0.05:
    print("There is no significant difference between the means.")
else:
    print("There is a significant difference between the means.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a significant difference between the means. The statistic is: 2.326480593066605, the p-value is: 0.03187001618060791
Independent (Welch)
This test compares the means of two independent groups (without assuming equal variances).
from scipy.stats import ttest_ind

# Two independent samples of 11 heights each.
height_female = [175, 174, 174, 168, 166, 169, 172, 182, 169, 171, 175]
height_male = [181, 183, 178, 178, 171, 169, 173, 191, 168, 174, 177]

# equal_var=False selects Welch's t-test (no equal-variance assumption).
statistic, pvalue = ttest_ind(height_female, height_male, equal_var=False)
if pvalue >= 0.05:
    print("There is no significant difference between the groups.")
else:
    print("There is a significant difference between the groups.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is no significant difference between the groups. The statistic is: -1.7940948120062388, the p-value is: 0.09031762651932743
Related
This test compares two means where one sample can be paired with observations in another sample.
from scipy.stats import ttest_rel

# 11 paired observations: element i of `before` pairs with element i of `after`.
before = [20, 17, 12, 25, 21, 23, 19, 15, 15, 16, 22]
after = [22, 19, 17, 26, 21, 25, 25, 20, 18, 15, 23]

statistic, pvalue = ttest_rel(before, after)
if pvalue >= 0.05:
    print("There is no significant difference between the groups.")
else:
    print("There is a significant difference between the groups.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a significant difference between the groups. The statistic is: -3.557973584775827, the p-value is: 0.005198756772396905
Wilcoxon Signed-Rank Test¶
This non-parametric test determines whether two dependent (paired) samples differ significantly, by testing whether the paired differences are symmetrically distributed about zero.
from scipy.stats import wilcoxon

# 13 paired observations: element i of measure1 pairs with element i of measure2.
measure1 = [14.3, 13.1, 12.1, 12.6, 9.5, 12.6, 8.1, 8.9, 9.0, 8.3, 7.9, 8.1, 13.4]
measure2 = [12.7, 11.1, 15.3, 12.7, 10.5, 15.6, 11.2, 14.2, 16.3, 15.5, 19.9, 20.4, 9.8]

statistic, pvalue = wilcoxon(measure1, measure2)
if pvalue >= 0.05:
    print("There is no significant difference between the measures.")
else:
    print("There is a significant difference between the measures.")
print(f"The statistic is: {statistic}, the p-value is: {pvalue}")
There is a significant difference between the measures. The statistic is: 15.0, the p-value is: 0.03271484375