## 
## Attaching package: 'crayon'
## The following object is masked _by_ '.GlobalEnv':
## 
##     num_colors

Preparation

Download the files counts45.rds, covar45.rds and annot45.rds from the github repository. These RDS files are contain, respectively, the counts, the covariates and gene annotation for 45 samples from the data set GSE156063. These RDS files contain R objects saved with the saveRDS function.

The only reason we use the RDS files is that they are smaller than the corresponding TSV/CSV text files. You can load them as follows:

counts <- readRDS("counts45.rds")
covar  <- readRDS("covar45.rds")
annot  <- readRDS("annot45.rds")

## alternatively, download the .txt files (counts45.txt etc) and read them as follows:
counts <- read.table("counts45.txt")
covar  <- read.table("covar45.txt")
annot  <- read.table("annot45.txt")

There are three groups of data in the covariate file.

# install with install.packages("colorDF")
library(colorDF)
summary_colorDF(covar)
## # Color data frame (class colorDF) 5 x 12:
##   │Col          │Class│NAs  │unique│Summary                               
##  1label        <chr>    0    45All values unique                     
##  2filename     <chr>    0    45All values unique                     
##  3md5          <chr>    0    45All values unique                     
##  4replicate    <int>    0     11                                     
##  5title        <chr>    0    45All values unique                     
##  6geo_accession<chr>    0    45All values unique                     
##  7age          <dbl>    0    3522 [36 <55> 68] 89                    
##  8disease_state<chr>    0     3no virus: 15, other virus: 15, SC2: 15
##  9gender       <chr>    0     2M: 29, F: 16                          
## 10group        <chr>    0     3no: 15, other: 15, SC2: 15            
## 11sizeFactor   <dbl>    0    450.133 [0.456 <0.978> 1.931] 7.484     
## 12replaceable  <lgl>    0     1TRUE: 45 FALSE: 0                     

Exercise. (10 minutes) Take a look at the covar object. How many females and how many males are there? How many females have COVID-19, and how many males are healthy? What is the average age in the male and female groups? Hint: use the table function.

Differential gene expression analysis

To run a basic gene expression analysis, we need to use the DESeq2 package. The steps are as follows:

  • Create the DESeq2 object with counts and covariates
  • Run the calculations – fit the model
  • Extract the results

First two steps are below:

library(DESeq2)
rownames(covar) <- covar$label
ds2 <- DESeqDataSetFromMatrix(countData=counts, 
                              colData=covar,
                              design= ~ group)
ds2 <- DESeq(ds2)

The ds2 contains now the fitted model. We now have to get the results, but for that, we need to know the names of the results:

resultsNames(ds2)
## [1] "Intercept"         "group_other_vs_no" "group_SC2_vs_no"
res_sc2 <- results(ds2, name="group_SC2_vs_no")
res_sc2 <- as.data.frame(res_sc2)
print_colorDF(head(res_sc2))
## # Data frame like object (class data.frame) 6 x 6:
##                │baseMean│log2FoldChange│lfcSE│stat  │pvalue │padj  
## ENSG00000000003     370         0.509 0.41 1.2550.209  0.8098
## ENSG00000000419      60         0.180 0.32 0.5540.579  0.9538
## ENSG00000000460      56         0.278 0.34 0.8230.410  0.9234
## ENSG00000000938     452        -1.630 0.84-1.9320.053  0.5548
## ENSG00000000971     277         1.581 0.35 4.5525.3e-060.0023
## ENSG00000001036     116        -0.018 0.28-0.0620.950  0.9920

This is not very interesting, unless we merge it with the annotation table and order by p-values. The row names of res_sc2 correspond to the column “ENSEMBL” of annot:

## merge by ENSEMBL column from annot and rownames ("0")
## from res
res_sc2 <- merge(annot, res_sc2, by.x="ENSEMBL", by.y=0) %>%
  arrange(pvalue)
knitr::kable(head(res_sc2))
ENSEMBL SYMBOL GENENAME baseMean log2FoldChange lfcSE stat pvalue padj
ENSG00000126709 IFI6 interferon alpha inducible protein 6 676.6069 3.019484 0.4997842 6.041576 0 1.22e-05
ENSG00000138642 HERC6 HECT and RLD domain containing E3 ubiquitin protein ligase family member 6 385.4486 2.539668 0.4306000 5.897974 0 1.22e-05
ENSG00000169245 CXCL10 C-X-C motif chemokine ligand 10 460.9647 5.207046 0.8831769 5.895813 0 1.22e-05
ENSG00000137628 DDX60 DExD/H-box helicase 60 635.5409 2.361766 0.4050831 5.830325 0 1.22e-05
ENSG00000130032 PRRG3 proline rich and Gla domain 3 682.5724 -3.948837 0.6778485 -5.825545 0 1.22e-05
ENSG00000137959 IFI44L interferon induced protein 44 like 1517.2810 2.720096 0.4678168 5.814449 0 1.22e-05

Exercise. (10 minutes). How many genes are significant at padj < 0.05? How many at padj < 0.01? (padj is the false discovery rate from the Benjamini-Hochberg procedure). How many are significant at padj < 0.05 and abs(log2FoldChange) > 1?

Exercise. (15 minutes). Repeat the above steps for the other comparison (other virus vs. no virus) generating a suitable data frame. Merge the two result tables and plot the log2 fold changes from one comparison vs the log2 fold changes from the other comparison. What do you notice?

Exercise. (extra) The model ~ group fits the data using group as predictor. To find genes that are related to age, you would use a model like ~ age. Using DESeq2, find genes that show a significant association with age (repeat the steps above with a new model).

Visualizing the results

To visualize the results, we need to first calculate the log-counts per million. One could use here the cpm function from the package edgeR, but we will do it manually. We need, however, to add a small value to all results to avoid log2(0).

counts <- as.matrix(counts)
cpm <- counts + 2
lib_sizes <- colSums(cpm) / 1e6 # library sizes in millions
## a neat trick to divide a matrix by a row
cpm <- t(t(cpm) / lib_sizes)
cpm <- log2(cpm)

Now we can plot the top results.

par(mfrow=c(1,2))
id <- "ENSG00000126709" # IFI6
boxplot(counts[id, ] ~ covar$group)
boxplot(cpm[id, ] ~ covar$group)

Exercise. (5min) Choose another gene and plot it.

To create a heatmap, we can use the very simple heatmap function. We will select the top 25 genes for that:

sel <- res_sc2$ENSEMBL[1:25]
x <- cpm[sel, ]
colnames(x) <- covar$group
rownames(x) <- annot$SYMBOL[ match(sel, annot$ENSEMBL) ]
heatmap(x)

Exercise. (15min) Create PCA based on the CPM of the top 500 genes (see practicals 5 and 7).

vars <- apply(cpm, 1, var)
sel <- order(vars, decreasing = TRUE)[1:500]
mtx <- cpm[ sel, ]
pca <- prcomp(t(mtx), scale.=TRUE)
library(ggplot2)
df <- data.frame(covar, pca$x)
ggplot(df, aes(x=PC3, y=PC4, color=group)) + geom_point()

Gene set enrichment analysis

Gene set enrichment

Gene set enrichment analysis with tmod is as simple as this:

library(tmod)
res_tmod <- tmodCERNOtest(res_sc2$SYMBOL)
res_tmod
## # tmod report (class tmodReport) 8 x 69:
## # (Showing rows 1 - 30 out of 69)
##          │ID       │Title                                              │cerno│N1   │AUC  │cES  
##   DC.M3.4  DC.M3.4Interferon                                           331   43 0.90  3.8
##   DC.M1.2  DC.M1.2Interferon                                           250   24 0.97  5.2
##    LI.M75   LI.M75antiviral IFN signature                              176   20 0.89  4.4
##  DC.M4.13 DC.M4.13Inflammation                                         280   59 0.85  2.4
##   LI.M127  LI.M127type I interferon response                           118   12 0.93  4.9
##   LI.M165  LI.M165enriched in activated dendritic cells (II)           187   32 0.77  2.9
##   DC.M3.2  DC.M3.2Inflammation                                         360   94 0.72  1.9
##  LI.M11.0 LI.M11.0enriched in monocytes (II)                           513  157 0.68  1.6
##    LI.M67   LI.M67activated dendritic cells                             89   11 0.85  4.0
##   LI.M150  LI.M150innate antiviral response                             78    9 0.90  4.3
##    LI.M68   LI.M68RIG-1 like receptor signaling                         74    9 0.91  4.1
##   LI.M4.0  LI.M4.0cell cycle and transcription                         781  286 0.60  1.4
##     LI.S4    LI.S4Monocyte surface signature                           252   70 0.72  1.8
## LI.M111.1LI.M111.1viral sensing & immunity; IRF2 targets network (II)   72    9 0.86  4.0
##  LI.M37.0 LI.M37.0immune activation - generic cluster                  682  246 0.60  1.4
##  LI.M27.0 LI.M27.0chemokine cluster (I)                                 94   17 0.75  2.8
##    LI.S11   LI.S11Activated (LPS) dendritic cell surface signature     129   29 0.72  2.2
##  DC.M5.14 DC.M5.14Undetermined                                         160   43 0.73  1.9
##    LI.M13   LI.M13innate activation by cytosolic DNA sensing            63   10 0.80  3.1
##   DC.M5.7  DC.M5.7Inflammation                                         195   57 0.71  1.7
##  LI.M27.1 LI.M27.1chemokine cluster (II)                                68   12 0.74  2.8
##    LI.M20   LI.M20AP-1 transcription factor network                     72   14 0.87  2.6
## LI.M118.0LI.M118.0enriched in monocytes (IV)                           157   44 0.68  1.8
##    LI.M16   LI.M16TLR and inflammatory signaling                       153   43 0.72  1.8
##  LI.M37.1 LI.M37.1enriched in neutrophils (I)                          156   44 0.74  1.8
##   DC.M4.6  DC.M4.6Inflammation                                         176   52 0.70  1.7
## LI.M112.0LI.M112.0complement activation (I)                             75   16 0.68  2.3
## LI.M112.1LI.M112.1complement activation (II)                            47    8 0.74  2.9
##   DC.M5.1  DC.M5.1Inflammation                                         274   96 0.60  1.4
##  DC.M9.24 DC.M9.24Undetermined                                          99   26 0.64  1.9
##          │P.Value│adj.P.Val
##   DC.M3.4< 2e-16  1.2e-27
##   DC.M1.2< 2e-16  1.5e-26
##    LI.M75< 2e-16  1.3e-16
##  DC.M4.133.3e-15  5.0e-13
##   LI.M1272.5e-14  3.1e-12
##   LI.M1655.5e-14  5.5e-12
##   DC.M3.27.1e-13  6.2e-11
##  LI.M11.09.8e-12  7.4e-10
##    LI.M675.5e-10  3.7e-08
##   LI.M1502.0e-09  1.2e-07
##    LI.M681.0e-08  5.6e-07
##   LI.M4.01.2e-08  6.3e-07
##     LI.S41.9e-08  9.0e-07
## LI.M111.12.3e-08  1.0e-06
##  LI.M37.02.6e-08  1.1e-06
##  LI.M27.01.5e-07  5.6e-06
##    LI.S112.9e-07  1.0e-05
##  DC.M5.142.1e-06  7.1e-05
##    LI.M132.9e-06  9.1e-05
##   DC.M5.73.6e-06  1.1e-04
##  LI.M27.14.3e-06  1.2e-04
##    LI.M209.1e-06  2.5e-04
## LI.M118.09.4e-06  2.5e-04
##    LI.M161.2e-05  3.0e-04
##  LI.M37.11.2e-05  3.0e-04
##   DC.M4.61.5e-05  3.4e-04
## LI.M112.02.7e-05  6.0e-04
## LI.M112.17.2e-05  1.6e-03
##   DC.M5.19.4e-05  1.9e-03
##  DC.M9.249.5e-05  1.9e-03

Exercise. (20min) Run the enrichment analysis for the other comparison. Compare the results of the enrichment (AUC, p-values) on a plot.

ORA (Overrepresentation Analysis)

We will select genes from a single gene set and test them using a \(\chi^2\) test.

degs <- abs(res_sc2$log2FoldChange) > 1 & res_sc2$padj < 0.05
degs <- ifelse(degs, "DEG", "NS")
data(tmod)
ifn <- tmod$MODULES2GENES[["LI.M75"]]
ings <- res_sc2$SYMBOL %in% ifn
ings <- ifelse(ings, "IFN", "Other")
table(ings, degs)
chisq.test(table(ings, degs))

Other visualizations

Evidence plot

evidencePlot(res_sc2$SYMBOL, m="LI.M75", gene.labels = TRUE)

Panel plot

This one is a bit more complex.

res <- list()
res$sc2 <- results(ds2, name="group_SC2_vs_no")
res$sc2 <- as.data.frame(res$sc2)
res$other <- results(ds2, name="group_other_vs_no")
res$other <- as.data.frame(res$other)

res$sc2 <- merge(annot, res$sc2, by.x="ENSEMBL", by.y=0) %>%
  arrange(pvalue)
res$other <- merge(annot, res$other, by.x="ENSEMBL", by.y=0) %>%
  arrange(pvalue)

tmod_res <- lapply(res, function(x) tmodCERNOtest(x$SYMBOL))

tmodPanelPlot(tmod_res)