Chapter 1 Visualization

ggplot2 is a powerful package for data visualization in R. Most of the figures in this chapter are plotted using ggplot2. Other great packages such as VennDiagram, UpSetR, and ComplexHeatmap are used to generate special figures like Venn diagram, UpSet, and Heatmap, etc.

1.1 Boxplot

### Add stats manually, 3 groups

dataForBoxPlot <- readRDS('data/dataForBoxPlot.rds')

#dataForBoxPlot <- filter(dataForBoxPlot,
#                         HistologyGrade %in% c('Normal','Mild dysplasia','Moderate dysplasia'))


dataForBoxPlot <- dataForBoxPlot[dataForBoxPlot$HistologyGrade 
                                 %in% c('Normal','Mild dysplasia','Moderate dysplasia'),]


dataForBoxPlot$expr <- dataForBoxPlot$DSG3
dataForBoxPlot$group <- factor(dataForBoxPlot$HistologyGrade,
                               levels=c('Normal','Mild dysplasia','Moderate dysplasia'))

t <- pairwise.wilcox.test(dataForBoxPlot$expr, dataForBoxPlot$group,
                          p.adjust.method = 'none')

#t <- pairwise.t.test(dataForBoxPlot$expr, dataForBoxPlot$group,
#                     p.adjust.method = 'none')

pValue <- c(t$p.value)[!is.na(c(t$p.value))]

df <- data.frame(x1=c(1,2,1, 1,3,1, 2,3,2), # 1 vs. 2; 1 vs. 3; 2 vs. 3
                 x2=c(1,2,2, 1,3,3, 2,3,3),
                 y1=c(5,5,5.2, 6,6,6.2, 5.5,5.5,5.7),
                 y2=c(5.2,5.2,5.2, 6.2,6.2,6.2, 5.7,5.7,5.7))


anno <- data.frame(x=c(1.5,2,2.5),
                   y=c(5.3,6.3,5.9),
                   label=as.character(symnum(pValue, #corr = FALSE, na = FALSE,
                                             cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                             symbols = c("***",'**','*','ns'))))

#anno <- data.frame(x=c(1.5,2,2.5),
#                   y=c(8.55,8.95,8.8),
#                   label = ifelse(pValue >= 0.01,
#                                  paste0('p = ', formatC(pValue, digits = 2)),
#                                  paste0('p = ', formatC(pValue, format = "e", digits = 2))))

ggplot(data=dataForBoxPlot, aes(x=group, y=expr)) +
  geom_boxplot(aes(fill=group),
               outlier.shape = NA, outlier.size = NA, #outlier.colour = 'black',
               outlier.fill = NA) +
  facet_wrap(~dataset, nrow=1) +
  geom_jitter(size=2, width=0.05, color='black') + #darkblue
  scale_color_manual(values=c("#56B4E9", "#E69F00")) +
  labs(x='', y=expression('Expression Level (Log'[2]*'CPM)')) +
  geom_segment(data=df,aes(x = x1, y = y1, xend = x2, yend = y2)) +
  geom_text(data =anno, aes(x, y, label=label, group=NULL),
            size=4) +
  theme(legend.position = 'none')+
  theme(axis.text = element_text(size=14,color='black'),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size=16)) +
  theme(strip.text = element_text(size=14, face='bold')) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

### Add stats automatically, 3 groups

## ggsignif(), can provide self-defined annotations, otherwise perform wilcox test by default

dataForBoxPlot <- readRDS('data/dataForBoxPlot.rds')

dataForBoxPlot <- filter(dataForBoxPlot,
                         HistologyGrade %in% c('Normal','Mild dysplasia','Moderate dysplasia'))

dataForBoxPlot$expr <- dataForBoxPlot$DSG3
dataForBoxPlot$group <- factor(dataForBoxPlot$HistologyGrade,
                               levels=c('Normal','Mild dysplasia','Moderate dysplasia'))
  
t <- pairwise.wilcox.test(dataForBoxPlot$expr, dataForBoxPlot$group,
                          p.adjust.method = 'none')

#t <- pairwise.t.test(dataForBoxPlot$expr, dataForBoxPlot$group,
#                     p.adjust.method = 'none')
#t$p.value

pValue <- c(t$p.value)[!is.na(c(t$p.value))]

wtq <- levels(dataForBoxPlot$group)
lis <- combn(wtq,2)
my_comparisons <- tapply(lis, rep(1:ncol(lis), each=nrow(lis)), function(i) i)
my_comparisons <- my_comparisons[c(1,2,3)]

my_annotations <- as.character(symnum(pValue, #corr = FALSE, na = FALSE,
                                      cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                      symbols = c("***",'**','*','ns')))


#my_annotations <- ifelse(pValue >= 0.01, paste0('p = ', formatC(pValue, digits = 2)),
# paste0('p = ', formatC(pValue, format = "e", digits = 2)))

ggplot(data=dataForBoxPlot, aes(x=group, y=expr)) +
  geom_boxplot(aes(fill=group),
               outlier.shape = NA, outlier.size = NA,#outlier.colour = 'black',
               outlier.fill = NA) +
  #geom_smooth(method='lm') +
  facet_wrap(~dataset) +
  geom_jitter(size=2, width=0.05, color='black') +
  labs(x='', y=expression('Expression Level (Log'[2]*'CPM)')) +
  #geom_segment(data=df,aes(x = x1, y = y1, xend = x2, yend = y2)) +
  geom_signif(annotations = my_annotations, # optional
              comparisons = my_comparisons,
              step_increase = 0.1,
              vjust=.2,
              colour='gray20',
              tip_length=0.015) + # 0
  theme(legend.position = 'none')+
  theme(axis.text = element_text(size=14,color='black'),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size=16)) +
  theme(strip.text = element_text(size=14, face='bold')) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

### Comparison within each group

dataForBoxPlot <- readRDS('data/dataForBoxPlot.rds')

dataForBoxPlot <- data.frame(expr=as.numeric(unlist(dataForBoxPlot[,1:5])),
                             group=rep(dataForBoxPlot$SampleType,5),
                             gene=rep(colnames(dataForBoxPlot)[1:5],each=nrow(dataForBoxPlot)),
                             stringsAsFactors = F)
dataForBoxPlot$group <- factor(dataForBoxPlot$group, levels=c('Normal','Dysplasia'))
dataForBoxPlot$gene <- factor(dataForBoxPlot$gene, levels=unique(dataForBoxPlot$gene))

#dataForBoxPlot <- dataForBoxPlot[-which(is.na(dataForBoxPlot$expr)),]
#dim(dataForBoxPlot)

# NOT WORKING UNLESS plyr IS UNLOADED
pValue <- dataForBoxPlot %>% group_by(gene) %>%
  do(test = wilcox.test(expr~group, data=., paired=FALSE)) %>%
  summarise(gene, pval = test$p.value)

pValue <- pValue$pval

my_annotations <- as.character(symnum(pValue, #corr = FALSE, na = FALSE,
                                      cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                      symbols = c("***",'**','*','ns')))

#my_annotations <- ifelse(pValue >= 0.01, paste0('p = ', formatC(pValue, digits = 2)),
# paste0('p = ', formatC(pValue, format = "e", digits = 2)))
#anno <- data.frame(x=1:5,y=17.5,label=my_annotations)

exprMax <- dataForBoxPlot %>% group_by(gene) %>% summarise(maxExpr=max(expr))

anno <- data.frame(x=exprMax$gene,y=exprMax$maxExpr+1,label=my_annotations,gene=exprMax$gene)
anno <- anno[which(anno$label!='ns'),]

p <- position_dodge(0.58)

bxplt <- ggplot(dataForBoxPlot, aes(x=gene, y=expr, group = interaction(group, gene)))
#fill=interaction(group,project))
#stat_boxplot(geom ='errorbar', width=0.5, position = position_dodge(width = 0.75))
#stat_summary(geom ='crossbar', width=0.5, color='white')

bxplt+geom_boxplot(aes(color=group, fill=group),
                   outlier.shape = 21, outlier.size = 1,#outlier.colour = 'black',
                   outlier.fill = NA, alpha=1, width=0.5,
                   position = p) +
  stat_summary(geom = "crossbar", width=0.45, fatten=0, color="white", position=p,
               fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) +
  #scale_fill_npg(palette = 'nrc', alpha = 0.5)+
  #scale_fill_manual(values=c('limegreen', 'blue'),
  #                  labels=c('Normal', 'Dysplasia'),
  #                  name='Sample type') +
  #geom_jitter(size=0.3)+
  labs(x='', y=expression('Expression Level (Log'[2]*'CPM)')) +
  #ylim(-5,20) +
  geom_text(data =anno, aes(x, y, label=label, group=NULL),
            size=5) +
  #geom_signif(annotations = my_annotations,
  #            y_position = rep(7.5,5),
  #            xmin = c(1,2,3,4,5)-0.15,
  #            xmax = c(1,2,3,4,5)+0.15,
  #            #comparisons = my_comparisons,
  #            step_increase = 0,
  #            vjust=.2,
  #            colour='gray20',
  #            tip_length=0) + # 0
  theme_bw()+
  theme(legend.title = element_blank(),
        legend.text = element_text(size=14),
        legend.position = 'right') +
  theme(axis.title=element_text(size=16),
        axis.text = element_text(color='black', size=14),
        axis.text.x = element_text(angle = 0, hjust=0.5)) +
  theme(axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

### Facet_wrap()
dataForBoxPlot <- readRDS('data/dataForBoxPlot.rds')

dataForBoxPlot <- data.frame(expr=as.numeric(unlist(dataForBoxPlot[,1:5])),
                             group=rep(dataForBoxPlot$SampleType,5),
                             gene=rep(colnames(dataForBoxPlot)[1:5],each=nrow(dataForBoxPlot)),
                             stringsAsFactors = F)
dataForBoxPlot$group <- factor(dataForBoxPlot$group, levels=c('Normal','Dysplasia'))
dataForBoxPlot$gene <- factor(dataForBoxPlot$gene, levels=unique(dataForBoxPlot$gene))

#dataForBoxPlot <- dataForBoxPlot[-which(is.na(dataForBoxPlot$expr)),]
#dim(dataForBoxPlot)

# NOT WORKING UNLESS plyr IS UNLOADED
pValue <- dataForBoxPlot %>% group_by(gene) %>%
  do(test = wilcox.test(expr~group, data=., paired=FALSE)) %>%
  summarise(gene, pval = test$p.value)

pValue <- pValue$pval

my_annotations <- as.character(symnum(pValue, #corr = FALSE, na = FALSE,
                                      cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                      symbols = c("***",'**','*','ns')))

#my_annotations <- ifelse(pValue >= 0.01, paste0('p = ', formatC(pValue, digits = 2)),
# paste0('p = ', formatC(pValue, format = "e", digits = 2)))
#anno <- data.frame(x=1:5,y=17.5,label=my_annotations)

exprMax <- dataForBoxPlot %>% group_by(gene) %>% summarise(maxExpr=max(expr))

anno <- data.frame(x=1.5,y=7.5,label=my_annotations,gene=exprMax$gene)
anno <- anno[which(anno$label!='ns'),]

ggplot(data=dataForBoxPlot, aes(x=group, y=expr)) +
  geom_boxplot(aes(fill=group),
               outlier.shape = NA, outlier.size = NA,#outlier.colour = 'black',
               outlier.fill = NA) +
  #geom_boxplot(aes(color=group, fill=group),
  #             outlier.shape = 21, outlier.size = 1,#outlier.colour = 'black',
  #             outlier.fill = NA, alpha=1, width=0.5) +
  #stat_summary(geom = "crossbar", width=0.45, fatten=0, color="white", position=p,
  #             fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) +
  facet_wrap(~gene, nrow=1) +
  geom_jitter(size=2, width=0.05, color='darkblue') +
  labs(x='', y=expression('Expression Level (Log'[2]*'CPM)')) +
  #geom_segment(data=df,aes(x = x1, y = y1, xend = x2, yend = y2)) +
  geom_text(data =anno, aes(x, y, label=label, group=NULL),
            size=5) +
  theme(legend.position = 'none')+
  theme(axis.text = element_text(size=14,color='black'),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size=16),
        strip.text = element_text(size=14, face='bold')) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

1.2 Barplot

## # A tibble: 53 x 3
##    Subtype                                   expr    sd
##    <chr>                                    <dbl> <dbl>
##  1 Adipose - Subcutaneous                   3.03   1.29
##  2 Adipose - Visceral (Omentum)             4.00   1.75
##  3 Adrenal Gland                            2.53   1.17
##  4 Artery - Aorta                           3.53   1.42
##  5 Artery - Coronary                        3.46   1.57
##  6 Artery - Tibial                          2.48   1.27
##  7 Bladder                                  2.48   1.09
##  8 Brain - Amygdala                         0.909  1.80
##  9 Brain - Anterior cingulate cortex (BA24) 0.488  1.59
## 10 Brain - Caudate (basal ganglia)          1.08   1.65
## # … with 43 more rows

### Back to back

# normal

barData <- readRDS(file='data/dataForBarPlot_TCGA.rds')

barData <- barData[barData$Program=='TCGA',]

dataForBarPlot <- data.frame(Size=c(barData$Seq, barData$Exp),
                             Project=rep(barData$Project,2),
                             Type=rep(c('Seq','Exp'),nrow(barData)),
                             stringsAsFactors = F)


dataForBarPlot$Size <- ifelse(dataForBarPlot$Type=='Seq',dataForBarPlot$Size, dataForBarPlot$Size*-1)

o <- order(dataForBarPlot$Size[dataForBarPlot$Type=='Seq'], decreasing = T)

dataForBarPlot$Project <- factor(dataForBarPlot$Project,
                                 levels=dataForBarPlot$Project[dataForBarPlot$Type=='Seq'][o])



ggplot(dataForBarPlot,aes(y = Size, x = Project, fill = Type)) + 
  geom_bar(stat="identity", width = 0.8) + #,position=position_dodge(0.58)
  labs(x='', y='Number of Samples') +
  coord_flip() + #scale_x_discrete(limits=rev(levels(dataForBarPlot$Tissue))) +
  #facet_grid(Category ~ .) + #, scales = "free"
  #ylim(0,30) +
  #scale_fill_manual(values=fill) +
  scale_y_continuous(breaks=seq(-1000,1000,500), labels = abs(seq(-1000,1000,500))) +
  theme_bw()+theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_rect(colour='white'),
                   panel.background = element_blank()) +
  theme(axis.text=element_text(size=14, color='black'),
        axis.text.x =element_text(size=14, angle = 0, color='black', hjust = 0),
        axis.title.y =element_blank(), # flip x, y
        axis.title.x =element_text(size=16),
        strip.text = element_text(face = 'bold', size=12)) +
  theme(legend.text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = 'top') +
  theme(plot.margin =  margin(t = 0.25, r = 0.5, b = 0.25, l = 0.25, unit = "cm"))

### Side by side

# single
barData <- readRDS('data/dataForBarPlot_TCGA.rds')

barData <- barData[barData$Program=='TCGA',]

dataForBarPlot <- data.frame(Size=c(barData$Seq, barData$Exp),
                             Project=rep(barData$Project,2),
                             Type=rep(c('Seq','Exp'),each=nrow(barData)),
                             stringsAsFactors = F)

#dataForBarPlot$Size <- ifelse(dataForBarPlot$Type=='Seq',dataForBarPlot$Size, dataForBarPlot$Size*-1)

o <- order(dataForBarPlot$Size[dataForBarPlot$Type=='Seq'], decreasing = T)

dataForBarPlot$Project <- factor(dataForBarPlot$Project,
                                 levels=dataForBarPlot$Project[dataForBarPlot$Type=='Seq'][o])

ggplot(dataForBarPlot, aes(y = Size, x = Project, fill = Type)) + 
  geom_bar(stat="identity", width=0.8, position='dodge') +
  #geom_text(aes(y = pos, x = pathway, label = paste0(percentage, "%")),
  #           data = dataForPlot) +
  labs(x='', y='Number of Samples') +
  #facet_grid(Category ~ .) + #, scales = "free"
  #ylim(0,30) +
  #scale_fill_manual(values=fill) +
  #scale_y_continuous(labels = dollar_format(suffix = "%", prefix = "")) +
  theme_bw()+theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_rect(colour='white'),
                   panel.background = element_blank()) +
  theme(axis.text=element_text(size=14, color='black'),
        axis.text.x =element_text(size=14, angle = 45, color='black', hjust = 1),
        axis.title.x =element_blank(),
        axis.title.y =element_text(size=16),
        strip.text = element_text(face = 'bold', size=12)) +
  theme(legend.text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = 'top') +
  theme(plot.margin =  margin(t = 0.25, r = 0.5, b = 0.25, l = 0.25, unit = "cm"))

# side by side, multiple
barData <- readRDS('data/dataForBarPlot_TCGA.rds')

dataForBarPlot <- data.frame(Size=c(barData$Seq, barData$Exp),
                             Project=rep(barData$Project,2),
                             Type=rep(c('Seq','Exp'),each=nrow(barData)),
                             Program=rep(barData$Program,2),
                             stringsAsFactors = F)

o <- order(dataForBarPlot[dataForBarPlot$Type=='Seq',]$Program,
           dataForBarPlot[dataForBarPlot$Type=='Seq',]$Size,
           decreasing = T)

dataForBarPlot$Project <- factor(dataForBarPlot$Project, 
                                 levels = dataForBarPlot[dataForBarPlot$Type=='Seq',]$Project[o])

dataForBarPlot$Program <- factor(dataForBarPlot$Program, levels=c('TCGA','TARGET'))


ggplot(dataForBarPlot, aes(y = Size, x = Project, fill = Type)) + 
  geom_bar(stat="identity", width=0.8, position='dodge') +
  #geom_text(aes(y = pos, x = pathway, label = paste0(percentage, "%")),
  #           data = dataForPlot) +
  labs(x='', y='Number of Samples') +
  facet_wrap(~Program, scales='free', nrow=2) + #, scales = "free"
  #ylim(0,80) +
  #scale_fill_manual(values=fill) +
  #scale_y_continuous(labels = dollar_format(suffix = "%", prefix = "")) +
  theme_bw()+theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_rect(colour='black'),
                   panel.background = element_blank()) +
  theme(axis.text=element_text(size=14, color='black'),
        axis.text.x =element_text(size=14, angle = 45, color='black', hjust = 1),
        axis.title.x =element_blank(),
        axis.title.y =element_text(size=16),
        strip.text = element_text(face = 'bold', size=12)) +
  theme(legend.text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = 'top') + 
  theme(plot.margin =  margin(t = 0.25, r = 0.5, b = 0.25, l = 0.25, unit = "cm"))

# multiple

dataForBarPlot <- readRDS('data/dataForBarPlot_PDX.rds')
dataForBarPlot <- dataForBarPlot[with(dataForBarPlot, order(CANCER.TYPE, LOG2.FPKM.)),]
dataForBarPlot$PDX.MODEL <- factor(dataForBarPlot$PDX.MODEL, levels = dataForBarPlot$PDX.MODEL)

idx <- which(! 1:length(dataForBarPlot$CANCER.TYPE) %in% which(duplicated(dataForBarPlot$CANCER.TYPE)))

colors = c('red','orange','yellow','springgreen','royalblue','cyan','purple','pink',
           'tomato', 'chocolate','goldenrod','yellowgreen','dodgerblue','turquoise',
           'mediumorchid','hotpink','brown','coral','khaki','limegreen','skyblue',
           'aquamarine','magenta','tan','blue','seagreen','violet','maroon','peru')

ggplot(data=dataForBarPlot, aes(x=PDX.MODEL, y=LOG2.FPKM., fill=CANCER.TYPE, color=CANCER.TYPE)) +
  geom_bar(stat='identity', width=0.01) + #coord_flip()
  #geom_errorbar(aes(ymin=expr, ymax=expr+sd), width=.2, size=0.5, #expr-sd
  #              position=position_dodge(.9)) +
  labs(x='', y=expression('Gene Expression (Log'[2]*'FPKM)'))+#, title='FPR1 Expresion (Microarray)') +
  scale_x_discrete(breaks = dataForBarPlot$PDX.MODEL[idx]) +
  scale_fill_manual(values = colors) +
  scale_color_manual(values = colors) +
  theme_bw()+
  #scale_y_continuous(trans = 'sqrt',
  #                   breaks = c(0,2.5,50,250,750),
  #                   labels = c(0,2.5,50,250,750)) +
  #scale_y_sqrt() +
  #scale_y_continuous(trans='log2') +
  ylim(-2,6)+
  theme(legend.title = element_blank(),
        legend.text = element_text(size=12),
        legend.position = 'bottom') +
  theme(axis.title=element_text(size=16),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size=14)) +
  theme(axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank()) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

### Stacked

# Normal

# single
barData <- readRDS('data/dataForBarPlot_TCGA.rds')

barData <- barData[barData$Program=='TCGA',]

dataForBarPlot <- data.frame(Size=c(barData$Seq, barData$Exp),
                             Project=rep(barData$Project,2),
                             Type=rep(c('Seq','Exp'),each=nrow(barData)),
                             stringsAsFactors = F)

#dataForBarPlot$Size <- ifelse(dataForBarPlot$Type=='Seq',dataForBarPlot$Size, dataForBarPlot$Size*-1)

o <- order(dataForBarPlot$Size[dataForBarPlot$Type=='Seq'], decreasing = T)

dataForBarPlot$Project <- factor(dataForBarPlot$Project,
                                 levels=dataForBarPlot$Project[dataForBarPlot$Type=='Seq'][o])

ggplot(dataForBarPlot, aes(y = Size, x = Project, fill = Type)) + 
  geom_bar(stat="identity", width=0.8) +
  #geom_text(aes(y = pos, x = pathway, label = paste0(percentage, "%")),
  #           data = dataForPlot) +
  labs(x='', y='Number of Samples') +
  #facet_grid(Category ~ .) + #, scales = "free"
  #ylim(0,60) +
  scale_fill_manual(values=c("#4285F4", "#FBBC05")) +
  #scale_fill_manual(values=c("#56B4E9", "#E69F00")) +
  #scale_y_continuous(labels = dollar_format(suffix = "%", prefix = "")) +
  theme_bw()+theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_rect(colour='white'),
                   panel.background = element_blank()) +
  theme(axis.text=element_text(size=14, color='black'),
        axis.text.x =element_text(size=14, angle = 45, color='black', hjust = 1),
        axis.title.x =element_blank(),
        axis.title.y =element_text(size=16),
        strip.text = element_text(face = 'bold', size=12)) +
  theme(legend.text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = 'top') +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

# Percentage label
barData <- readRDS('data/dataForBarPlot_TCGA.rds')

barData <- barData[barData$Program=='TCGA',]

barData$ExpPercent <- round(barData$Exp/(barData$Exp+barData$Seq),2)*100
barData$SeqPercent <- 100 - barData$ExpPercent

dataForBarPlot <- data.frame(Percent=c(barData$SeqPercent, barData$ExpPercent),
                             Project=rep(barData$Project,2),
                             Type=rep(c('Seq','Exp'),each=nrow(barData)),
                             stringsAsFactors = F)

pos <- dataForBarPlot %>% group_by(Project) %>% 
  transmute(pos=cumsum(Percent)-0.5*Percent)

dataForBarPlot$Pos <- pos$pos

o <- order(dataForBarPlot$Percent[dataForBarPlot$Type=='Seq'], decreasing = T)

dataForBarPlot$Project <- factor(dataForBarPlot$Project,
                                 levels=dataForBarPlot$Project[dataForBarPlot$Type=='Seq'][o])

fill <- c("#40b8d0", "#b2d183")
#fill <- c("orange", "dodgerblue")

ggplot(dataForBarPlot,
       aes(y = Percent, x = Project, fill = Type)) +
  geom_bar(stat="identity") +
  geom_text(data = dataForBarPlot,
            aes(y = Pos, x = Project, label = paste0(Percent, "%"))) +
  labs(x='', y='Percentage') +
  scale_fill_manual(values=fill) +
  scale_y_continuous(labels = dollar_format(suffix = "%", prefix = "")) +
  theme_bw()+theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_rect(colour='white'),
                   panel.background = element_blank()) +
  theme(axis.text=element_text(size=14, color='black'),
        axis.text.x =element_text(size=14, angle = 45, color='black', hjust = 1),
        axis.title.x =element_blank(),
        axis.title.y =element_text(size=16)) +
  theme(legend.text = element_text(size=14),
        legend.title = element_blank(),
        legend.position = 'top') +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

1.3 Scatter plot

# Correlation, facet_wrap

dataForScatterPlot <- readRDS('data/dataForBoxPlot.rds')
dataForScatterPlot$SampleType <- factor(dataForScatterPlot$SampleType,
                                        levels=c('Normal','Dysplasia'))

#cor.test(scatterData$DSG3, scatterData$IL1R2)
#cor.test(scatterData$DSG3, scatterData$CLDN18)

#dataForScatterPlot <- data.frame(ref=rep(scatterData$DSG3,2),
#                                 expr=c(scatterData$IL1R2, scatterData$CLDN18),
#                                 gene=rep(c('IL1R2', 'CLDN18'), each=nrow(scatterData)))

#anno <- dataForScatterPlot %>% group_by(gene) %>%
#  summarise(cor = cor.test(ref, expr)$estimate, p = cor.test(ref, expr)$p.value)

anno <- dataForScatterPlot %>% group_by(SampleType) %>%
  summarise(cor = cor.test(DSG3, IL1R2)$estimate, p = cor.test(DSG3, IL1R2)$p.value)

anno_text <- data.frame(
  #label = c("corr = 0.446\np < 0.001", "corr = 0.543\np < 0.001"),
  SampleType=anno$SampleType, #SampleType is used for facet_wrap()
  label = paste0('corr = ', round(anno$cor,3), '\n',
                 'p = ', ifelse(anno$p >= 0.01,
                                formatC(anno$p, digits = 2),
                                formatC(anno$p, format = "e", digits = 2))),
  x     = c(2.5,2.5),
  y     = c(4,4)
)

ggplot(data=dataForScatterPlot, aes(x=DSG3, y=IL1R2)) +
  geom_point(size=2, color='darkred') +
  geom_smooth(method='lm') +
  geom_text(data    = anno_text,
            mapping = aes(x = x, y = y, label = label),
            size=4.5) +
  facet_wrap(~SampleType) +
  labs(x='Expression Level (DSG3)', y='Expression Level (IL1R2)') +
  theme(legend.position = 'none')+
  theme(axis.text = element_text(size=14),
        axis.title = element_text(size=16),
        strip.text.x = element_text(size=14, face='bold'))

# paired scatter
dataForScatterPlot <- readRDS('data/dataForScatterPlot_Treatment_Response.rds')

dataForScatterPlot$group <- factor(dataForScatterPlot$group,
                                   levels=c('anti-PD1 R (Baseline)', 'anti-PD1 R (Treatment)',
                                            'anti-PD1 NR (Baseline)', 'anti-PD1 NR (Treatment)',
                                            'anti-CTLA4 R (Baseline)', 'anti-CTLA4 R (Treatment)',
                                            'anti-CTLA4 NR (Baseline)', 'anti-CTLA4 NR (Treatment)'))

dataForScatterPlot$groupCol <- factor(dataForScatterPlot$groupCol,
                                      levels=c('anti-PD1 R', 'anti-PD1 NR',
                                               'anti-CTLA4 R', 'anti-CTLA4 NR'))

wtq <- levels(dataForScatterPlot$group)
lis <- combn(wtq,2)

my_comparisons <- tapply(lis, rep(1:ncol(lis), each=nrow(lis)), function(i) i)

idx <- which(!duplicated(unlist(lapply(my_comparisons, function(x) x[1]))))
my_comparisons <- my_comparisons[idx]
idx <- grep('Baseline', unlist(lapply(my_comparisons, function(x) x[1])))
my_comparisons <- my_comparisons[idx]

pValue <- c()

for (i in 1:length(my_comparisons)) {
  #print (my_comparisons[[i]])

  comparisons <- my_comparisons[[i]]

  idx1 <- which(dataForScatterPlot$group==comparisons[1])
  idx2 <- which(dataForScatterPlot$group==comparisons[2])

  df1 <- data.frame(expr=dataForScatterPlot$expr[idx1], 
                    subject=dataForScatterPlot$subject[idx1], 
                    group=comparisons[1])
  df2 <- data.frame(expr=dataForScatterPlot$expr[idx2], 
                    subject=dataForScatterPlot$subject[idx2], 
                    group=comparisons[2])



  ovlp <- intersect(df1$subject, df2$subject)

  expr1 <- df1$expr[match(ovlp, df1$subject)]
  expr2 <- df2$expr[match(ovlp, df2$subject)]

  #print (expr1)
  #print (expr2)


  p <- wilcox.test(expr1, expr2, paired=TRUE)$p.value
  #print (p)
  #p2 <- wilcox.test(expr1, expr2)$p.value
  #print (p2)
  pValue <- c(pValue, p)

}

exprMax <- max(dataForScatterPlot$expr) + 0.2

df <- data.frame(x1=c(1,2,1,3,4,3,5,6,5,7,8,7),
                 x2=c(1,2,2,3,4,4,5,6,6,7,8,8),
                 y1=rep(c(exprMax,exprMax,exprMax+0.05),length(pValue)),
                 y2=rep(c(exprMax+0.05, exprMax+0.05, exprMax+0.05),length(pValue)))

anno <- data.frame(x=c(1.5,3.5,5.5,7.5),
                   y=rep(exprMax+0.12,length(pValue)),
                   label=as.character(symnum(pValue, #corr = FALSE, na = FALSE,
                                             cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                             symbols = c("***",'**','*','ns'))))

ggplot(dataForScatterPlot, aes(x=group, y=expr, group=subject, color=groupCol)) +
  geom_point(size=2) +
  geom_line() +
  #facet_wrap(~dataset) +
  labs(x='', y=expression('Expression Level (Log'[2]*'Intensity)')) +
  geom_segment(data = df,aes(x = x1, y = y1, xend = x2, yend = y2, color=NULL, group=NULL)) +
  geom_text(data = anno, aes(x, y, label=label, group=NULL, color=NULL),
            size=4.5) + # group=NULL, color=NULL should be specified
  theme(legend.position = 'none')+
  theme(axis.text = element_text(size=14,color='black'),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size=16)) +
  theme(strip.text = element_text(size=16)) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

1.4 Volcano plot

library(ggrepel)

dataForVolcanoPlot <- readRDS(file='data/dataForVolcanoPlot.rds')

logFcThreshold <- log2(1.5)
adjPvalThreshold <- 0.05

dataForVolcanoPlot$Significance[with(dataForVolcanoPlot, 
                                     logFC < logFcThreshold | adj.P.Val > adjPvalThreshold)] <- 'NS'
dataForVolcanoPlot$Significance[with(dataForVolcanoPlot, 
                                     logFC >= logFcThreshold & adj.P.Val <= adjPvalThreshold)] <- 'UP'
dataForVolcanoPlot$Significance[with(dataForVolcanoPlot, 
                                     logFC <= -logFcThreshold & adj.P.Val <= adjPvalThreshold)] <- 'DOWN'

ggplot(dataForVolcanoPlot, aes(x = logFC, y = -log10(adj.P.Val))) +
  #xlim(-7.5,7.5)
  labs(x=expression('log'[2]*'(Fold Change)'), 
       y=(expression('-log'[10]*'(FDR)')), 
       title=NULL) +
  geom_point(aes(color=Significance), alpha=1, size=2) +
  geom_vline(xintercept = c(-logFcThreshold, logFcThreshold),
             color='darkgreen', linetype='dashed') +
  geom_hline(yintercept = -log10(adjPvalThreshold), 
             color='darkgreen',linetype='dashed')+
  #scale_x_continuous(breaks=c(-4,-2,0,2,4,6,8,10)) +
  #scale_y_continuous(expand = c(0.3, 0)) +
  #scale_color_manual(values = c('#4285F4',"gray", '#FBBC05')) +
  scale_color_manual(values = c('green3',"black", "red")) +
  #facet_wrap(~Comparison, ncol = 2) +
  geom_text_repel(data = subset(dataForVolcanoPlot, 
                                adj.P.Val < adjPvalThreshold & logFC > log2(2)), 
                  segment.alpha = 0.4, aes(label = Symbol), 
                  size = 3.5, color='red', segment.color = 'black') +
  geom_text_repel(data = subset(dataForVolcanoPlot, 
                                adj.P.Val < adjPvalThreshold & logFC < log2(2)*-1), 
                  segment.alpha = 0.4, aes(label = Symbol), 
                  size = 3.5, color='green3', segment.color = 'black') +
  theme(legend.position="none") +
  theme(axis.text=element_text(size=14),
        axis.title=element_text(size=16),
        strip.text = element_text(size=14, face='bold')) +
  theme(plot.margin =  margin(t = 0.25, r = 0.25, b = 0.25, l = 0.25, unit = "cm"))

ggplot(dataForVolcanoPlot, aes(x = logFC, y = -log10(adj.P.Val))) +
  #xlim(-7.5,7.5)
  labs(x=expression('log'[2]*'(Fold Change)'), 
       y=(expression('-log'[10]*'(FDR)')), 
       title=NULL) +
  geom_point(aes(color=Significance), alpha=1, size=2) +
  geom_vline(xintercept = c(-logFcThreshold, logFcThreshold),
             color='darkgreen', linetype='dashed') +
  geom_hline(yintercept = -log10(adjPvalThreshold), 
             color='darkgreen',linetype='dashed')+
  #scale_x_continuous(breaks=c(-4,-2,0,2,4,6,8,10)) +
  #scale_y_continuous(expand = c(0.3, 0)) +
  #scale_color_manual(values = c('#4285F4',"gray", '#FBBC05')) +
  scale_color_manual(values = c('green3',"black", "red")) +
  #facet_wrap(~Comparison, ncol = 2) +
  geom_text_repel(data = subset(dataForVolcanoPlot, 
                                adj.P.Val < adjPvalThreshold & logFC > log2(2)), 
                  segment.alpha = 0.4, aes(label = Symbol), 
                  size = 3.5, color='red', segment.color = 'black') +
  geom_text_repel(data = subset(dataForVolcanoPlot, 
                                adj.P.Val < adjPvalThreshold & logFC < log2(2)*-1), 
                  segment.alpha = 0.4, aes(label = Symbol), 
                  size = 3.5, color='green3', segment.color = 'black') +
  theme_bw() +
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_rect(colour='black'),
        panel.background = element_blank()) +
  theme(axis.text=element_text(size=14),
        axis.title=element_text(size=16),
        plot.title = element_text(size = 14, face = 'bold', hjust = 0.5),
        legend.position = 'none',
        #legend.text = element_text(size = 14),
        #legend.title = element_text(size = 14, face = 'bold'),
        strip.text = element_text(size = 14, face = 'bold'))

1.5 Venn Diagram

## A Venn object on 5 sets named
## A,B,C,D,E 
## 00000 10000 01000 11000 00100 10100 01100 11100 00010 10010 01010 11010 
##     0    28    70    43    20    13    34    31    47    47    84    52 
## 00110 10110 01110 11110 00001 10001 01001 11001 00101 10101 01101 11101 
##    22    17    44    25    29    15    41    23    10    15    14    16 
## 00011 10011 01011 11011 00111 10111 01111 11111 
##    40    19    37    27    18    12    42    17
##  [1] 159 427 278 306 638 845 695 262 121 827 849 317 209 205 346 955 914

1.6 UpSet plot

1.7 Heatmap

library(ComplexHeatmap)
library(RColorBrewer)
library(GEOquery)

eSet <- readRDS('data/dataForHeatmap_eSet.rds')
exprData <- exprs(eSet)
phenoData <- pData(eSet)

phenoData$SampleType <- factor(phenoData$SampleType, levels=c('Normal','Dysplasia'))
phenoData$HistologyGrade <- factor(phenoData$HistologyGrade,
                                   levels=c('Normal','Hyperplasia','Mild dysplasia',
                                            'Moderate dysplasia','Severe dysplasia'))

o <- order(phenoData$SampleType, phenoData$HistologyGrade)
phenoData <- phenoData[o,]
exprData <- exprData[,o]

scaledExprData <- t(scale(t(exprData)))

annoColors <- list(
  `Histology Grade`=c(Normal='gray96',
                   Hyperplasia='gray72',
                   `Mild dysplasia`='gray48',
                   `Moderate dysplasia`='gray14',
                   `Severe dysplasia`='gray0'),
  `Sample Type`=c(Normal='darkolivegreen',
                  Dysplasia='lightcoral'))

topAnnotation = HeatmapAnnotation(`Sample Type`=phenoData[,'SampleType'],
                                  `Histology Grade`=phenoData[,'HistologyGrade'],
                                  col=annoColors,
                                  simple_anno_size_adjust = TRUE,
                                  #annotation_height = c(1,1),
                                  height = unit(8, "mm"),
                                  #summary = anno_summary(height = unit(4, "cm")),
                                  show_legend = c("bar" = TRUE),
                                  show_annotation_name = F)


col_fun = colorRampPalette(rev(c("red",'white','blue')), space = "Lab")(100)
#col_fun = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
# col_fun = colorRamp2(c(-4, 0, 4), c("blue", "white", "red")) # For Legend()
#library(colorspace)
#col_fun = diverge_hcl(100, c = 100, l = c(50,90), power = 1.5)


### pre-defined order of columns
ht <- Heatmap(scaledExprData,
              #name = 'Expression',

              # COLOR
              #col = colorRampPalette(rev(c("red",'white','blue')), space = "Lab")(100),
              col=col_fun,
              na_col = 'grey',

              # MAIN PANEL
              column_title = NULL,
              cluster_columns = FALSE,
              cluster_rows = TRUE,
              show_row_dend = TRUE,
              show_column_dend = TRUE,
              show_row_names = FALSE,
              show_column_names = TRUE,
              column_names_rot = 90,
              column_names_gp = gpar(fontsize = 10),
              #column_names_max_height = unit(3, 'cm'),
              #column_split = factor(phenoData$Day,
              #                      levels=str_sort(unique(phenoData$Day), numeric = T)),

              column_order = rownames(phenoData),

              # ANNOTATION
              top_annotation = topAnnotation,

              # LEGEND
              heatmap_legend_param = list(
                #at = c(-5, 0, 5),
                #labels = c("low", "zero", "high"),
                title = "Expression",
                title_position = 'leftcenter-rot',
                legend_height = unit(3, "cm"),
                adjust = c("right", "top")
              ))

draw(ht,annotation_legend_side = "right",row_dend_side = "left")