from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene

import warnings
warnings.filterwarnings('ignore')

df_counts = pd.read_excel('counts and metadata.xlsx', sheet_name='counts')
df_counts = df_counts.set_index('Gene_id')
df_counts = df_counts.T

df_counts.head()

df_metadata = pd.read_excel('counts and metadata.xlsx', sheet_name='meta')
df_metadata = df_metadata.set_index('Sample')

df_metadata

df_counts = df_counts.loc[:, ~((df_counts == 0) | df_counts.isna()).all()]

df_counts

dds = DeseqDataSet(
    counts=df_counts, 
    metadata=df_metadata, 
    design_factors="Condition"
)

dds.deseq2()

Fitting size factors...
... done in 0.05 seconds.

Using None as control genes, passed at DeseqDataSet initialization

Fitting dispersions...
... done in 12.47 seconds.

Fitting dispersion trend curve...
... done in 1.89 seconds.

Fitting MAP dispersions...
... done in 17.37 seconds.

Fitting LFCs...
... done in 11.05 seconds.

Calculating cook's distance...
... done in 0.08 seconds.

Replacing 0 outlier genes.

stat_res = DeseqStats(dds, n_cpus=8, contrast = ('Condition','Treatment','Control'))
stat_res.summary()
df_result = stat_res.results_df

Running Wald tests...
... done in 4.96 seconds.

Log2 fold change & Wald test p-value: Condition Treatment vs Control
                   baseMean  log2FoldChange     lfcSE      stat    pvalue  \
Gene_id                                                                     
ENSG00000186827    1.107782        2.107585  1.581738  1.332449  0.182713   
ENSG00000186891    1.071147        2.779575  1.635077  1.699966  0.089137   
ENSG00000160072  320.481168       -0.492257  0.151983 -3.238909  0.001200   
ENSG00000041988  103.354977       -0.282149  0.135449 -2.083063  0.037246   
ENSG00000260179    0.758697       -1.646872  1.631440 -1.009460  0.312754   
...                     ...             ...       ...       ...       ...   
ENSG00000278817    0.352622        1.109326  1.874159  0.591906  0.553914   
ENSG00000278384    3.791022       -0.246760  0.691445 -0.356876  0.721185   
ENSG00000276345    4.833929       -0.092388  3.260101 -0.028339  0.977392   
ENSG00000271254   58.543851        0.317326  0.343451  0.923932  0.355522   
ENSG00000275987    0.108747       -0.300576  3.869272 -0.077683  0.938080   

                     padj  
Gene_id                    
ENSG00000186827       NaN  
ENSG00000186891       NaN  
ENSG00000160072  0.018556  
ENSG00000041988  0.203249  
ENSG00000260179       NaN  
...                   ...  
ENSG00000278817       NaN  
ENSG00000278384  0.902359  
ENSG00000276345  0.994499  
ENSG00000271254  0.678541  
ENSG00000275987       NaN  

[31378 rows x 6 columns]

mg = mygene.MyGeneInfo()
gene_ids = df_result.index.tolist()
results = mg.querymany(gene_ids, scopes='ensembl.gene', fields='symbol', species='human')
gene_map = {item['query']: item.get('symbol', item['query']) for item in results}
df_result['gene_symbol'] = df_result.index.map(gene_map)

df_result.head()

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
16 input query terms found dup hits:	[('ENSG00000228044', 2), ('ENSG00000261600', 2), ('ENSG00000227110', 2), ('ENSG00000215156', 2), ('E
545 input query terms found no hit:	['ENSG00000272482', 'ENSG00000224621', 'ENSG00000225643', 'ENSG00000287400', 'ENSG00000286863', 'ENS

df_result = df_result[df_result.baseMean >= 10] # remove lowly-expressed genes
df_result_sigs = df_result[(df_result.padj < 0.05) & (abs(df_result.log2FoldChange) > 0.5)] # remove non-differentially expressed genes
print("Filtered dataframe contains " + str(df_result_sigs.shape[0]) + " differentially expressed genes") 
df_result_sigs = df_result_sigs.sort_values(by=['stat'])

df_result_sigs.head()

Filtered dataframe contains 1078 differentially expressed genes

# Thresholds for significance
padj_threshold = 0.05
lfc_threshold = 1.0

# Calculate -log10(padj)
df_result['-log10padj'] = -np.log10(df_result['padj'])

# Categorise genes as upgregulated/downregulated and define colours for these
df_result['sig_status'] = 'Not Significant'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] > lfc_threshold), 'sig_status'] = 'Upregulated'
df_result.loc[(df_result['padj'] < padj_threshold) & (df_result['log2FoldChange'] < -lfc_threshold), 'sig_status'] = 'Downregulated'
colours = {'Not Significant': 'grey', 'Upregulated': 'red', 'Downregulated': 'blue'}

# Create the plot
plt.figure(figsize=(8, 6))
for status, color in colours.items():
    subset = df_result[df_result['sig_status'] == status]
    plt.scatter(subset['log2FoldChange'], subset['-log10padj'],c=color, label=status, alpha=0.6, s=15)

# Add threshold lines
plt.axhline(-np.log10(padj_threshold), color='black', linestyle='--', linewidth=0.8)
plt.axvline(lfc_threshold, color='black', linestyle='--', linewidth=0.8)
plt.axvline(-lfc_threshold, color='black', linestyle='--', linewidth=0.8)

# Add labels/legend
plt.xlabel('log2 Fold Change', fontsize=12)
plt.ylabel('-log10(Adjusted p-value)', fontsize=12)
plt.legend()

plt.grid(alpha=0.3)
plt.tight_layout()

	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	gene_symbol
Gene_id
ENSG00000213281	1158.402814	-0.914722	0.084207	-10.862752	1.734423e-27	1.381381e-23	NRAS
ENSG00000071626	1277.372016	-0.654556	0.069358	-9.437342	3.823564e-21	1.522639e-17	DAZAP1
ENSG00000164163	1484.253437	-0.514938	0.055423	-9.291121	1.526721e-20	4.053188e-17	ABCE1
ENSG00000108559	427.968377	-0.710476	0.081458	-8.722029	2.732594e-18	6.218212e-15	NUP88
ENSG00000182963	229.003182	-1.313308	0.154161	-8.519080	1.608254e-17	3.202235e-14	GJC1

Load data¶

Filter data¶

Perform analysis and add gene symbol¶

Plot data¶

Gene_id	ENSG00000186827	ENSG00000186891	ENSG00000160072	ENSG00000041988	ENSG00000260179	ENSG00000225972	ENSG00000224315	ENSG00000198744	...	ENSG00000276345	ENSG00000271254	ENSG00000275987
Ctr_s1	0	1	361	118	2	6	0	1	...	20	56	0
Ctr_s3	0	0	427	156	0	2	0	494	...	0	55	0
Ctr_s8	1	0	416	124	1	3	0	6	...	0	54	1
Ctr_s11	1	0	549	133	2	4	0	4	...	0	76	0
RS_s7	4	0	328	101	0	8	1	624	...	0	47	0

	Condition
Sample
Ctr_s1	Control
Ctr_s3	Control
Ctr_s8	Control
Ctr_s11	Control
RS_s7	Treatment
RS_s10	Treatment
RS_s13	Treatment
RS_s19	Treatment

	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj	gene_symbol
Gene_id
ENSG00000186827	1.107782	2.107585	1.581738	1.332449	0.182713	NaN	TNFRSF4
ENSG00000186891	1.071147	2.779575	1.635077	1.699966	0.089137	NaN	TNFRSF18
ENSG00000160072	320.481168	-0.492257	0.151983	-3.238909	0.001200	0.018556	ATAD3B
ENSG00000041988	103.354977	-0.282149	0.135449	-2.083063	0.037246	0.203249	THAP3
ENSG00000260179	0.758697	-1.646872	1.631440	-1.009460	0.312754	NaN	ENSG00000260179