In [1]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene
import warnings
warnings.filterwarnings('ignore')
Load data¶
In [2]:
df_counts = pd.read_excel('counts and metadata.xlsx', sheet_name='counts')
df_counts = df_counts.set_index('Gene_id')
df_counts = df_counts.T
df_counts.head()
Out[2]:
| Gene_id | ENSG00000284662 | ENSG00000186827 | ENSG00000186891 | ENSG00000160072 | ENSG00000041988 | ENSG00000260179 | ENSG00000234396 | ENSG00000225972 | ENSG00000224315 | ENSG00000198744 | ... | ENSG00000276351 | ENSG00000276345 | ENSG00000273532 | ENSG00000275063 | ENSG00000277856 | ENSG00000271254 | ENSG00000275987 | ENSG00000268674 | ENSG00000277475 | ENSG00000275405 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ctr_s1 | 0 | 0 | 1 | 361 | 118 | 2 | 0 | 6 | 0 | 1 | ... | 0 | 20 | 0 | 0 | 0 | 56 | 0 | 0 | 0 | 0 |
| Ctr_s3 | 0 | 0 | 0 | 427 | 156 | 0 | 0 | 2 | 0 | 494 | ... | 0 | 0 | 0 | 0 | 0 | 55 | 0 | 0 | 0 | 0 |
| Ctr_s8 | 0 | 1 | 0 | 416 | 124 | 1 | 0 | 3 | 0 | 6 | ... | 0 | 0 | 0 | 0 | 0 | 54 | 1 | 0 | 0 | 0 |
| Ctr_s11 | 0 | 1 | 0 | 549 | 133 | 2 | 0 | 4 | 0 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 76 | 0 | 0 | 0 | 0 |
| RS_s7 | 0 | 4 | 0 | 328 | 101 | 0 | 0 | 8 | 1 | 624 | ... | 0 | 0 | 0 | 0 | 0 | 47 | 0 | 0 | 0 | 0 |
5 rows × 60663 columns
In [3]:
df_metadata = pd.read_excel('counts and metadata.xlsx', sheet_name='meta')
df_metadata = df_metadata.set_index('Sample')
df_metadata
Out[3]:
| Condition | |
|---|---|
| Sample | |
| Ctr_s1 | Control |
| Ctr_s3 | Control |
| Ctr_s8 | Control |
| Ctr_s11 | Control |
| RS_s7 | Treatment |
| RS_s10 | Treatment |
| RS_s13 | Treatment |
| RS_s19 | Treatment |
Filter data¶
In [4]:
df_counts = df_counts.loc[:, ~((df_counts == 0) | df_counts.isna()).all()]
df_counts
Out[4]:
| Gene_id | ENSG00000186827 | ENSG00000186891 | ENSG00000160072 | ENSG00000041988 | ENSG00000260179 | ENSG00000225972 | ENSG00000224315 | ENSG00000198744 | ENSG00000228037 | ENSG00000142611 | ... | ENSG00000278673 | ENSG00000278704 | ENSG00000277400 | ENSG00000276256 | ENSG00000273748 | ENSG00000278817 | ENSG00000278384 | ENSG00000276345 | ENSG00000271254 | ENSG00000275987 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ctr_s1 | 0 | 1 | 361 | 118 | 2 | 6 | 0 | 1 | 0 | 73 | ... | 1 | 0 | 4 | 74 | 26 | 0 | 2 | 20 | 56 | 0 |
| Ctr_s3 | 0 | 0 | 427 | 156 | 0 | 2 | 0 | 494 | 0 | 24 | ... | 1 | 0 | 0 | 16 | 50 | 0 | 7 | 0 | 55 | 0 |
| Ctr_s8 | 1 | 0 | 416 | 124 | 1 | 3 | 0 | 6 | 0 | 54 | ... | 0 | 0 | 1 | 68 | 37 | 1 | 6 | 0 | 54 | 1 |
| Ctr_s11 | 1 | 0 | 549 | 133 | 2 | 4 | 0 | 4 | 0 | 14 | ... | 0 | 0 | 3 | 28 | 15 | 0 | 5 | 0 | 76 | 0 |
| RS_s7 | 4 | 0 | 328 | 101 | 0 | 8 | 1 | 624 | 0 | 5 | ... | 1 | 0 | 1 | 15 | 40 | 1 | 8 | 0 | 47 | 0 |
| RS_s10 | 0 | 2 | 174 | 56 | 1 | 1 | 0 | 0 | 1 | 11 | ... | 0 | 0 | 0 | 21 | 20 | 0 | 2 | 12 | 67 | 0 |
| RS_s13 | 0 | 2 | 246 | 85 | 0 | 3 | 0 | 4 | 1 | 17 | ... | 0 | 2 | 3 | 28 | 30 | 1 | 2 | 0 | 49 | 0 |
| RS_s19 | 3 | 2 | 202 | 89 | 0 | 5 | 0 | 3 | 0 | 1 | ... | 0 | 0 | 0 | 16 | 13 | 0 | 1 | 0 | 53 | 0 |
8 rows × 31378 columns
Perform analysis and add gene symbol¶
In [5]:
dds = DeseqDataSet(
counts=df_counts,
metadata=df_metadata,
design_factors="Condition"
)
dds.deseq2()
Fitting size factors... ... done in 0.05 seconds.
Using None as control genes, passed at DeseqDataSet initialization
Fitting dispersions... ... done in 12.47 seconds. Fitting dispersion trend curve... ... done in 1.89 seconds. Fitting MAP dispersions... ... done in 17.37 seconds. Fitting LFCs... ... done in 11.05 seconds. Calculating cook's distance... ... done in 0.08 seconds. Replacing 0 outlier genes.
In [6]:
stat_res = DeseqStats(dds, n_cpus=8, contrast = ('Condition','Treatment','Control'))
stat_res.summary()
df_result = stat_res.results_df
Running Wald tests... ... done in 4.96 seconds.
Log2 fold change & Wald test p-value: Condition Treatment vs Control
baseMean log2FoldChange lfcSE stat pvalue \
Gene_id
ENSG00000186827 1.107782 2.107585 1.581738 1.332449 0.182713
ENSG00000186891 1.071147 2.779575 1.635077 1.699966 0.089137
ENSG00000160072 320.481168 -0.492257 0.151983 -3.238909 0.001200
ENSG00000041988 103.354977 -0.282149 0.135449 -2.083063 0.037246
ENSG00000260179 0.758697 -1.646872 1.631440 -1.009460 0.312754
... ... ... ... ... ...
ENSG00000278817 0.352622 1.109326 1.874159 0.591906 0.553914
ENSG00000278384 3.791022 -0.246760 0.691445 -0.356876 0.721185
ENSG00000276345 4.833929 -0.092388 3.260101 -0.028339 0.977392
ENSG00000271254 58.543851 0.317326 0.343451 0.923932 0.355522
ENSG00000275987 0.108747 -0.300576 3.869272 -0.077683 0.938080
padj
Gene_id
ENSG00000186827 NaN
ENSG00000186891 NaN
ENSG00000160072 0.018556
ENSG00000041988 0.203249
ENSG00000260179 NaN
... ...
ENSG00000278817 NaN
ENSG00000278384 0.902359
ENSG00000276345 0.994499
ENSG00000271254 0.678541
ENSG00000275987 NaN
[31378 rows x 6 columns]
In [7]:
mg = mygene.MyGeneInfo()
gene_ids = df_result.index.tolist()
results = mg.querymany(gene_ids, scopes='ensembl.gene', fields='symbol', species='human')
gene_map = {item['query']: item.get('symbol', item['query']) for item in results}
df_result['gene_symbol'] = df_result.index.map(gene_map)
df_result.head()
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
16 input query terms found dup hits: [('ENSG00000228044', 2), ('ENSG00000261600', 2), ('ENSG00000227110', 2), ('ENSG00000215156', 2), ('E
545 input query terms found no hit: ['ENSG00000272482', 'ENSG00000224621', 'ENSG00000225643', 'ENSG00000287400', 'ENSG00000286863', 'ENS
Out[7]:
| baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | gene_symbol | |
|---|---|---|---|---|---|---|---|
| Gene_id | |||||||
| ENSG00000186827 | 1.107782 | 2.107585 | 1.581738 | 1.332449 | 0.182713 | NaN | TNFRSF4 |
| ENSG00000186891 | 1.071147 | 2.779575 | 1.635077 | 1.699966 | 0.089137 | NaN | TNFRSF18 |
| ENSG00000160072 | 320.481168 | -0.492257 | 0.151983 | -3.238909 | 0.001200 | 0.018556 | ATAD3B |
| ENSG00000041988 | 103.354977 | -0.282149 | 0.135449 | -2.083063 | 0.037246 | 0.203249 | THAP3 |
| ENSG00000260179 | 0.758697 | -1.646872 | 1.631440 | -1.009460 | 0.312754 | NaN | ENSG00000260179 |
In [8]:
df_result = df_result[df_result.baseMean >= 10] # remove lowly-expressed genes
df_result_sigs = df_result[(df_result.padj < 0.05) & (abs(df_result.log2FoldChange) > 0.5)] # remove non-differentially expressed genes
print("Filtered dataframe contains " + str(df_result_sigs.shape[0]) + " differentially expressed genes")
df_result_sigs = df_result_sigs.sort_values(by=['stat'])
df_result_sigs.head()
Filtered dataframe contains 1078 differentially expressed genes
Out[8]:
| baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | gene_symbol | |
|---|---|---|---|---|---|---|---|
| Gene_id | |||||||
| ENSG00000213281 | 1158.402814 | -0.914722 | 0.084207 | -10.862752 | 1.734423e-27 | 1.381381e-23 | NRAS |
| ENSG00000071626 | 1277.372016 | -0.654556 | 0.069358 | -9.437342 | 3.823564e-21 | 1.522639e-17 | DAZAP1 |
| ENSG00000164163 | 1484.253437 | -0.514938 | 0.055423 | -9.291121 | 1.526721e-20 | 4.053188e-17 | ABCE1 |
| ENSG00000108559 | 427.968377 | -0.710476 | 0.081458 | -8.722029 | 2.732594e-18 | 6.218212e-15 | NUP88 |
| ENSG00000182963 | 229.003182 | -1.313308 | 0.154161 | -8.519080 | 1.608254e-17 | 3.202235e-14 | GJC1 |
Plot data¶
In [9]:
# Thresholds for significance
padj_threshold = 0.05
lfc_threshold = 1.0
# Calculate -log10(padj)
df_result['-log10padj'] = -np.log10(df_result['padj'])
# Categorise genes as upgregulated/downregulated and define colours for these
df_result['sig_status'] = 'Not Significant'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] > lfc_threshold), 'sig_status'] = 'Upregulated'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] < -lfc_threshold), 'sig_status'] = 'Downregulated'
colours = {'Not Significant': 'grey', 'Upregulated': 'red', 'Downregulated': 'blue'}
# Create the plot
plt.figure(figsize=(8, 6))
for status, color in colours.items():
subset = df_result[df_result['sig_status'] == status]
plt.scatter(subset['log2FoldChange'], subset['-log10padj'],c=color, label=status, alpha=0.6, s=15)
# Add threshold lines
plt.axhline(-np.log10(padj_threshold), color='black', linestyle='--', linewidth=0.8)
plt.axvline(lfc_threshold, color='black', linestyle='--', linewidth=0.8)
plt.axvline(-lfc_threshold, color='black', linestyle='--', linewidth=0.8)
# Add labels/legend
plt.xlabel('log2 Fold Change', fontsize=12)
plt.ylabel('-log10(Adjusted p-value)', fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()