##
## Attaching package: 'crayon'
## The following object is masked _by_ '.GlobalEnv':
##
## num_colors
Download the files counts45.rds
,
covar45.rds
and annot45.rds
from the github
repository. These RDS files are contain, respectively, the counts, the
covariates and gene annotation for 45 samples from the data set GSE156063.
These RDS files contain R objects saved with the saveRDS
function.
The only reason we use the RDS files is that they are smaller than the corresponding TSV/CSV text files. You can load them as follows:
counts <- readRDS("counts45.rds")
covar <- readRDS("covar45.rds")
annot <- readRDS("annot45.rds")
## alternatively, download the .txt files (counts45.txt etc) and read them as follows:
counts <- read.table("counts45.txt")
covar <- read.table("covar45.txt")
annot <- read.table("annot45.txt")
There are three groups of data in the covariate file.
# install with install.packages("colorDF")
library(colorDF)
summary_colorDF(covar)
## # Color data frame (class colorDF) 5 x 12:
## │Col │Class│NAs │unique│Summary
## 1│label │<chr>│ 0│ 45│All values unique
## 2│filename │<chr>│ 0│ 45│All values unique
## 3│md5 │<chr>│ 0│ 45│All values unique
## 4│replicate │<int>│ 0│ 1│1
## 5│title │<chr>│ 0│ 45│All values unique
## 6│geo_accession│<chr>│ 0│ 45│All values unique
## 7│age │<dbl>│ 0│ 35│22 [36 <55> 68] 89
## 8│disease_state│<chr>│ 0│ 3│no virus: 15, other virus: 15, SC2: 15
## 9│gender │<chr>│ 0│ 2│M: 29, F: 16
## 10│group │<chr>│ 0│ 3│no: 15, other: 15, SC2: 15
## 11│sizeFactor │<dbl>│ 0│ 45│0.133 [0.456 <0.978> 1.931] 7.484
## 12│replaceable │<lgl>│ 0│ 1│TRUE: 45 FALSE: 0
Exercise. (10 minutes) Take a look at the
covar
object. How many females and how many males are
there? How many females have COVID-19, and how many males are healthy?
What is the average age in the male and female groups? Hint: use the
table
function.
To run a basic gene expression analysis, we need to use the DESeq2 package. The steps are as follows:
First two steps are below:
library(DESeq2)
rownames(covar) <- covar$label
ds2 <- DESeqDataSetFromMatrix(countData=counts,
colData=covar,
design= ~ group)
ds2 <- DESeq(ds2)
The ds2 contains now the fitted model. We now have to get the results, but for that, we need to know the names of the results:
resultsNames(ds2)
## [1] "Intercept" "group_other_vs_no" "group_SC2_vs_no"
res_sc2 <- results(ds2, name="group_SC2_vs_no")
res_sc2 <- as.data.frame(res_sc2)
print_colorDF(head(res_sc2))
## # Data frame like object (class data.frame) 6 x 6:
## │baseMean│log2FoldChange│lfcSE│stat │pvalue │padj
## ENSG00000000003│ 370│ 0.509│ 0.41│ 1.255│0.209 │0.8098
## ENSG00000000419│ 60│ 0.180│ 0.32│ 0.554│0.579 │0.9538
## ENSG00000000460│ 56│ 0.278│ 0.34│ 0.823│0.410 │0.9234
## ENSG00000000938│ 452│ -1.630│ 0.84│-1.932│0.053 │0.5548
## ENSG00000000971│ 277│ 1.581│ 0.35│ 4.552│5.3e-06│0.0023
## ENSG00000001036│ 116│ -0.018│ 0.28│-0.062│0.950 │0.9920
This is not very interesting, unless we merge it with the annotation
table and order by p-values. The row names of res_sc2
correspond to the column “ENSEMBL” of annot
:
## merge by ENSEMBL column from annot and rownames ("0")
## from res
res_sc2 <- merge(annot, res_sc2, by.x="ENSEMBL", by.y=0) %>%
arrange(pvalue)
knitr::kable(head(res_sc2))
ENSEMBL | SYMBOL | GENENAME | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
---|---|---|---|---|---|---|---|---|
ENSG00000126709 | IFI6 | interferon alpha inducible protein 6 | 676.6069 | 3.019484 | 0.4997842 | 6.041576 | 0 | 1.22e-05 |
ENSG00000138642 | HERC6 | HECT and RLD domain containing E3 ubiquitin protein ligase family member 6 | 385.4486 | 2.539668 | 0.4306000 | 5.897974 | 0 | 1.22e-05 |
ENSG00000169245 | CXCL10 | C-X-C motif chemokine ligand 10 | 460.9647 | 5.207046 | 0.8831769 | 5.895813 | 0 | 1.22e-05 |
ENSG00000137628 | DDX60 | DExD/H-box helicase 60 | 635.5409 | 2.361766 | 0.4050831 | 5.830325 | 0 | 1.22e-05 |
ENSG00000130032 | PRRG3 | proline rich and Gla domain 3 | 682.5724 | -3.948837 | 0.6778485 | -5.825545 | 0 | 1.22e-05 |
ENSG00000137959 | IFI44L | interferon induced protein 44 like | 1517.2810 | 2.720096 | 0.4678168 | 5.814449 | 0 | 1.22e-05 |
Exercise. (10 minutes). How many genes are significant at padj < 0.05? How many at padj < 0.01? (padj is the false discovery rate from the Benjamini-Hochberg procedure). How many are significant at padj < 0.05 and abs(log2FoldChange) > 1?
Exercise. (15 minutes). Repeat the above steps for the other comparison (other virus vs. no virus) generating a suitable data frame. Merge the two result tables and plot the log2 fold changes from one comparison vs the log2 fold changes from the other comparison. What do you notice?
Exercise. (extra) The model ~ group
fits the data using group
as predictor. To find genes that
are related to age, you would use a model like ~ age
. Using
DESeq2, find genes that show a significant association with age (repeat
the steps above with a new model).
To visualize the results, we need to first calculate the log-counts
per million. One could use here the cpm
function from the
package edgeR
, but we will do it manually. We need,
however, to add a small value to all results to avoid log2(0).
counts <- as.matrix(counts)
cpm <- counts + 2
lib_sizes <- colSums(cpm) / 1e6 # library sizes in millions
## a neat trick to divide a matrix by a row
cpm <- t(t(cpm) / lib_sizes)
cpm <- log2(cpm)
Now we can plot the top results.
par(mfrow=c(1,2))
id <- "ENSG00000126709" # IFI6
boxplot(counts[id, ] ~ covar$group)
boxplot(cpm[id, ] ~ covar$group)
Exercise. (5min) Choose another gene and plot it.
To create a heatmap, we can use the very simple heatmap
function. We will select the top 25 genes for that:
sel <- res_sc2$ENSEMBL[1:25]
x <- cpm[sel, ]
colnames(x) <- covar$group
rownames(x) <- annot$SYMBOL[ match(sel, annot$ENSEMBL) ]
heatmap(x)
Exercise. (15min) Create PCA based on the CPM of the top 500 genes (see practicals 5 and 7).
vars <- apply(cpm, 1, var)
sel <- order(vars, decreasing = TRUE)[1:500]
mtx <- cpm[ sel, ]
pca <- prcomp(t(mtx), scale.=TRUE)
library(ggplot2)
df <- data.frame(covar, pca$x)
ggplot(df, aes(x=PC3, y=PC4, color=group)) + geom_point()
Gene set enrichment analysis with tmod is as simple as this:
library(tmod)
res_tmod <- tmodCERNOtest(res_sc2$SYMBOL)
res_tmod
## # tmod report (class tmodReport) 8 x 69:
## # (Showing rows 1 - 30 out of 69)
## │ID │Title │cerno│N1 │AUC │cES
## DC.M3.4│ DC.M3.4│Interferon │ 331│ 43│ 0.90│ 3.8
## DC.M1.2│ DC.M1.2│Interferon │ 250│ 24│ 0.97│ 5.2
## LI.M75│ LI.M75│antiviral IFN signature │ 176│ 20│ 0.89│ 4.4
## DC.M4.13│ DC.M4.13│Inflammation │ 280│ 59│ 0.85│ 2.4
## LI.M127│ LI.M127│type I interferon response │ 118│ 12│ 0.93│ 4.9
## LI.M165│ LI.M165│enriched in activated dendritic cells (II) │ 187│ 32│ 0.77│ 2.9
## DC.M3.2│ DC.M3.2│Inflammation │ 360│ 94│ 0.72│ 1.9
## LI.M11.0│ LI.M11.0│enriched in monocytes (II) │ 513│ 157│ 0.68│ 1.6
## LI.M67│ LI.M67│activated dendritic cells │ 89│ 11│ 0.85│ 4.0
## LI.M150│ LI.M150│innate antiviral response │ 78│ 9│ 0.90│ 4.3
## LI.M68│ LI.M68│RIG-1 like receptor signaling │ 74│ 9│ 0.91│ 4.1
## LI.M4.0│ LI.M4.0│cell cycle and transcription │ 781│ 286│ 0.60│ 1.4
## LI.S4│ LI.S4│Monocyte surface signature │ 252│ 70│ 0.72│ 1.8
## LI.M111.1│LI.M111.1│viral sensing & immunity; IRF2 targets network (II)│ 72│ 9│ 0.86│ 4.0
## LI.M37.0│ LI.M37.0│immune activation - generic cluster │ 682│ 246│ 0.60│ 1.4
## LI.M27.0│ LI.M27.0│chemokine cluster (I) │ 94│ 17│ 0.75│ 2.8
## LI.S11│ LI.S11│Activated (LPS) dendritic cell surface signature │ 129│ 29│ 0.72│ 2.2
## DC.M5.14│ DC.M5.14│Undetermined │ 160│ 43│ 0.73│ 1.9
## LI.M13│ LI.M13│innate activation by cytosolic DNA sensing │ 63│ 10│ 0.80│ 3.1
## DC.M5.7│ DC.M5.7│Inflammation │ 195│ 57│ 0.71│ 1.7
## LI.M27.1│ LI.M27.1│chemokine cluster (II) │ 68│ 12│ 0.74│ 2.8
## LI.M20│ LI.M20│AP-1 transcription factor network │ 72│ 14│ 0.87│ 2.6
## LI.M118.0│LI.M118.0│enriched in monocytes (IV) │ 157│ 44│ 0.68│ 1.8
## LI.M16│ LI.M16│TLR and inflammatory signaling │ 153│ 43│ 0.72│ 1.8
## LI.M37.1│ LI.M37.1│enriched in neutrophils (I) │ 156│ 44│ 0.74│ 1.8
## DC.M4.6│ DC.M4.6│Inflammation │ 176│ 52│ 0.70│ 1.7
## LI.M112.0│LI.M112.0│complement activation (I) │ 75│ 16│ 0.68│ 2.3
## LI.M112.1│LI.M112.1│complement activation (II) │ 47│ 8│ 0.74│ 2.9
## DC.M5.1│ DC.M5.1│Inflammation │ 274│ 96│ 0.60│ 1.4
## DC.M9.24│ DC.M9.24│Undetermined │ 99│ 26│ 0.64│ 1.9
## │P.Value│adj.P.Val
## DC.M3.4│< 2e-16│ 1.2e-27
## DC.M1.2│< 2e-16│ 1.5e-26
## LI.M75│< 2e-16│ 1.3e-16
## DC.M4.13│3.3e-15│ 5.0e-13
## LI.M127│2.5e-14│ 3.1e-12
## LI.M165│5.5e-14│ 5.5e-12
## DC.M3.2│7.1e-13│ 6.2e-11
## LI.M11.0│9.8e-12│ 7.4e-10
## LI.M67│5.5e-10│ 3.7e-08
## LI.M150│2.0e-09│ 1.2e-07
## LI.M68│1.0e-08│ 5.6e-07
## LI.M4.0│1.2e-08│ 6.3e-07
## LI.S4│1.9e-08│ 9.0e-07
## LI.M111.1│2.3e-08│ 1.0e-06
## LI.M37.0│2.6e-08│ 1.1e-06
## LI.M27.0│1.5e-07│ 5.6e-06
## LI.S11│2.9e-07│ 1.0e-05
## DC.M5.14│2.1e-06│ 7.1e-05
## LI.M13│2.9e-06│ 9.1e-05
## DC.M5.7│3.6e-06│ 1.1e-04
## LI.M27.1│4.3e-06│ 1.2e-04
## LI.M20│9.1e-06│ 2.5e-04
## LI.M118.0│9.4e-06│ 2.5e-04
## LI.M16│1.2e-05│ 3.0e-04
## LI.M37.1│1.2e-05│ 3.0e-04
## DC.M4.6│1.5e-05│ 3.4e-04
## LI.M112.0│2.7e-05│ 6.0e-04
## LI.M112.1│7.2e-05│ 1.6e-03
## DC.M5.1│9.4e-05│ 1.9e-03
## DC.M9.24│9.5e-05│ 1.9e-03
Exercise. (20min) Run the enrichment analysis for the other comparison. Compare the results of the enrichment (AUC, p-values) on a plot.
We will select genes from a single gene set and test them using a \(\chi^2\) test.
degs <- abs(res_sc2$log2FoldChange) > 1 & res_sc2$padj < 0.05
degs <- ifelse(degs, "DEG", "NS")
data(tmod)
ifn <- tmod$MODULES2GENES[["LI.M75"]]
ings <- res_sc2$SYMBOL %in% ifn
ings <- ifelse(ings, "IFN", "Other")
table(ings, degs)
chisq.test(table(ings, degs))
evidencePlot(res_sc2$SYMBOL, m="LI.M75", gene.labels = TRUE)
This one is a bit more complex.
res <- list()
res$sc2 <- results(ds2, name="group_SC2_vs_no")
res$sc2 <- as.data.frame(res$sc2)
res$other <- results(ds2, name="group_other_vs_no")
res$other <- as.data.frame(res$other)
res$sc2 <- merge(annot, res$sc2, by.x="ENSEMBL", by.y=0) %>%
arrange(pvalue)
res$other <- merge(annot, res$other, by.x="ENSEMBL", by.y=0) %>%
arrange(pvalue)
tmod_res <- lapply(res, function(x) tmodCERNOtest(x$SYMBOL))
tmodPanelPlot(tmod_res)