In [1]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene

import warnings
warnings.filterwarnings('ignore')

Load data¶

In [2]:
df_counts = pd.read_excel('counts and metadata.xlsx', sheet_name='counts')
df_counts = df_counts.set_index('Gene_id')
df_counts = df_counts.T

df_counts.head()
Out[2]:
Gene_id ENSG00000284662 ENSG00000186827 ENSG00000186891 ENSG00000160072 ENSG00000041988 ENSG00000260179 ENSG00000234396 ENSG00000225972 ENSG00000224315 ENSG00000198744 ... ENSG00000276351 ENSG00000276345 ENSG00000273532 ENSG00000275063 ENSG00000277856 ENSG00000271254 ENSG00000275987 ENSG00000268674 ENSG00000277475 ENSG00000275405
Ctr_s1 0 0 1 361 118 2 0 6 0 1 ... 0 20 0 0 0 56 0 0 0 0
Ctr_s3 0 0 0 427 156 0 0 2 0 494 ... 0 0 0 0 0 55 0 0 0 0
Ctr_s8 0 1 0 416 124 1 0 3 0 6 ... 0 0 0 0 0 54 1 0 0 0
Ctr_s11 0 1 0 549 133 2 0 4 0 4 ... 0 0 0 0 0 76 0 0 0 0
RS_s7 0 4 0 328 101 0 0 8 1 624 ... 0 0 0 0 0 47 0 0 0 0

5 rows × 60663 columns

In [3]:
df_metadata = pd.read_excel('counts and metadata.xlsx', sheet_name='meta')
df_metadata = df_metadata.set_index('Sample')

df_metadata
Out[3]:
Condition
Sample
Ctr_s1 Control
Ctr_s3 Control
Ctr_s8 Control
Ctr_s11 Control
RS_s7 Treatment
RS_s10 Treatment
RS_s13 Treatment
RS_s19 Treatment

Filter data¶

In [4]:
df_counts = df_counts.loc[:, ~((df_counts == 0) | df_counts.isna()).all()]

df_counts
Out[4]:
Gene_id ENSG00000186827 ENSG00000186891 ENSG00000160072 ENSG00000041988 ENSG00000260179 ENSG00000225972 ENSG00000224315 ENSG00000198744 ENSG00000228037 ENSG00000142611 ... ENSG00000278673 ENSG00000278704 ENSG00000277400 ENSG00000276256 ENSG00000273748 ENSG00000278817 ENSG00000278384 ENSG00000276345 ENSG00000271254 ENSG00000275987
Ctr_s1 0 1 361 118 2 6 0 1 0 73 ... 1 0 4 74 26 0 2 20 56 0
Ctr_s3 0 0 427 156 0 2 0 494 0 24 ... 1 0 0 16 50 0 7 0 55 0
Ctr_s8 1 0 416 124 1 3 0 6 0 54 ... 0 0 1 68 37 1 6 0 54 1
Ctr_s11 1 0 549 133 2 4 0 4 0 14 ... 0 0 3 28 15 0 5 0 76 0
RS_s7 4 0 328 101 0 8 1 624 0 5 ... 1 0 1 15 40 1 8 0 47 0
RS_s10 0 2 174 56 1 1 0 0 1 11 ... 0 0 0 21 20 0 2 12 67 0
RS_s13 0 2 246 85 0 3 0 4 1 17 ... 0 2 3 28 30 1 2 0 49 0
RS_s19 3 2 202 89 0 5 0 3 0 1 ... 0 0 0 16 13 0 1 0 53 0

8 rows × 31378 columns

Perform analysis and add gene symbol¶

In [5]:
dds = DeseqDataSet(
    counts=df_counts, 
    metadata=df_metadata, 
    design_factors="Condition"
)

dds.deseq2()
Fitting size factors...
... done in 0.05 seconds.

Using None as control genes, passed at DeseqDataSet initialization
Fitting dispersions...
... done in 12.47 seconds.

Fitting dispersion trend curve...
... done in 1.89 seconds.

Fitting MAP dispersions...
... done in 17.37 seconds.

Fitting LFCs...
... done in 11.05 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 0 outlier genes.

In [6]:
stat_res = DeseqStats(dds, n_cpus=8, contrast = ('Condition','Treatment','Control'))
stat_res.summary()
df_result = stat_res.results_df
Running Wald tests...
... done in 4.96 seconds.

Log2 fold change & Wald test p-value: Condition Treatment vs Control
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Gene_id                                                                     
ENSG00000186827    1.107782        2.107585  1.581738  1.332449  0.182713   
ENSG00000186891    1.071147        2.779575  1.635077  1.699966  0.089137   
ENSG00000160072  320.481168       -0.492257  0.151983 -3.238909  0.001200   
ENSG00000041988  103.354977       -0.282149  0.135449 -2.083063  0.037246   
ENSG00000260179    0.758697       -1.646872  1.631440 -1.009460  0.312754   
...                     ...             ...       ...       ...       ...   
ENSG00000278817    0.352622        1.109326  1.874159  0.591906  0.553914   
ENSG00000278384    3.791022       -0.246760  0.691445 -0.356876  0.721185   
ENSG00000276345    4.833929       -0.092388  3.260101 -0.028339  0.977392   
ENSG00000271254   58.543851        0.317326  0.343451  0.923932  0.355522   
ENSG00000275987    0.108747       -0.300576  3.869272 -0.077683  0.938080   

                     padj  
Gene_id                    
ENSG00000186827       NaN  
ENSG00000186891       NaN  
ENSG00000160072  0.018556  
ENSG00000041988  0.203249  
ENSG00000260179       NaN  
...                   ...  
ENSG00000278817       NaN  
ENSG00000278384  0.902359  
ENSG00000276345  0.994499  
ENSG00000271254  0.678541  
ENSG00000275987       NaN  

[31378 rows x 6 columns]
In [7]:
mg = mygene.MyGeneInfo()
gene_ids = df_result.index.tolist()
results = mg.querymany(gene_ids, scopes='ensembl.gene', fields='symbol', species='human')
gene_map = {item['query']: item.get('symbol', item['query']) for item in results}
df_result['gene_symbol'] = df_result.index.map(gene_map)

df_result.head()
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
16 input query terms found dup hits:	[('ENSG00000228044', 2), ('ENSG00000261600', 2), ('ENSG00000227110', 2), ('ENSG00000215156', 2), ('E
545 input query terms found no hit:	['ENSG00000272482', 'ENSG00000224621', 'ENSG00000225643', 'ENSG00000287400', 'ENSG00000286863', 'ENS
Out[7]:
baseMean log2FoldChange lfcSE stat pvalue padj gene_symbol
Gene_id
ENSG00000186827 1.107782 2.107585 1.581738 1.332449 0.182713 NaN TNFRSF4
ENSG00000186891 1.071147 2.779575 1.635077 1.699966 0.089137 NaN TNFRSF18
ENSG00000160072 320.481168 -0.492257 0.151983 -3.238909 0.001200 0.018556 ATAD3B
ENSG00000041988 103.354977 -0.282149 0.135449 -2.083063 0.037246 0.203249 THAP3
ENSG00000260179 0.758697 -1.646872 1.631440 -1.009460 0.312754 NaN ENSG00000260179
In [8]:
df_result = df_result[df_result.baseMean >= 10] # remove lowly-expressed genes
df_result_sigs = df_result[(df_result.padj < 0.05) & (abs(df_result.log2FoldChange) > 0.5)] # remove non-differentially expressed genes
print("Filtered dataframe contains " + str(df_result_sigs.shape[0]) + " differentially expressed genes") 
df_result_sigs = df_result_sigs.sort_values(by=['stat'])

df_result_sigs.head()
Filtered dataframe contains 1078 differentially expressed genes
Out[8]:
baseMean log2FoldChange lfcSE stat pvalue padj gene_symbol
Gene_id
ENSG00000213281 1158.402814 -0.914722 0.084207 -10.862752 1.734423e-27 1.381381e-23 NRAS
ENSG00000071626 1277.372016 -0.654556 0.069358 -9.437342 3.823564e-21 1.522639e-17 DAZAP1
ENSG00000164163 1484.253437 -0.514938 0.055423 -9.291121 1.526721e-20 4.053188e-17 ABCE1
ENSG00000108559 427.968377 -0.710476 0.081458 -8.722029 2.732594e-18 6.218212e-15 NUP88
ENSG00000182963 229.003182 -1.313308 0.154161 -8.519080 1.608254e-17 3.202235e-14 GJC1

Plot data¶

In [9]:
# Thresholds for significance
padj_threshold = 0.05
lfc_threshold = 1.0

# Calculate -log10(padj)
df_result['-log10padj'] = -np.log10(df_result['padj'])

# Categorise genes as upgregulated/downregulated and define colours for these
df_result['sig_status'] = 'Not Significant'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] > lfc_threshold), 'sig_status'] = 'Upregulated'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] < -lfc_threshold), 'sig_status'] = 'Downregulated'
colours = {'Not Significant': 'grey', 'Upregulated': 'red', 'Downregulated': 'blue'}

# Create the plot
plt.figure(figsize=(8, 6))
for status, color in colours.items():
    subset = df_result[df_result['sig_status'] == status]
    plt.scatter(subset['log2FoldChange'], subset['-log10padj'],c=color, label=status, alpha=0.6, s=15)

# Add threshold lines
plt.axhline(-np.log10(padj_threshold), color='black', linestyle='--', linewidth=0.8)
plt.axvline(lfc_threshold, color='black', linestyle='--', linewidth=0.8)
plt.axvline(-lfc_threshold, color='black', linestyle='--', linewidth=0.8)

# Add labels/legend
plt.xlabel('log2 Fold Change', fontsize=12)
plt.ylabel('-log10(Adjusted p-value)', fontsize=12)
plt.legend()

plt.grid(alpha=0.3)
plt.tight_layout()
No description has been provided for this image