Chapter 2 Gene Expression Omnibus (GEO)
Download and preprocess transcriptomic data from GEO
2.1 Affymetrix
### Annotation (GV Version 20.0.0)
# Human Genome U133 Plus 2.0 Assay: pd.hg.u133.plus.2 [Manufacturer, Bioconductor]
# Human Gene 1.0 ST Assay: pd.hugene10st.hs.ensg [CDF]
# Human Gene 2.0 ST Assay: pd.hugene20st.hs.ensg [CDF]
# IN USE
#BiocManager::install('pd.hg.u133.plus.2')
#install.packages("http://mbni.org/customcdf/20.0.0/ensg.download/pd.hugene10st.hs.ensg_20.0.0.tar.gz",
# repos = NULL, type = "source")
#install.packages("http://mbni.org/customcdf/20.0.0/ensg.download/pd.hugene20st.hs.ensg_20.0.0.tar.gz",
# repos = NULL, type = "source")
### Annotation (Latest Version 23.0.0, Oct 26, 2018)
# Ensembl
#install.packages("http://mbni.org/customcdf/23.0.0/ensg.download/pd.hgu133plus2.hs.ensg_23.0.0.tar.gz",
# repos = NULL, type = "source")
# Gencode (Ensembl ID with version, not recommended)
#install.packages("http://mbni.org/customcdf/23.0.0/gencodeg.download/pd.hgu133plus2.hs.gencodeg_23.0.0.tar.gz",
# repos = NULL, type = "source")
# Attach the manufacturer platform-design package for the HG-U133 Plus 2.0
# array; this is the annotation actually used by the code below.
library("pd.hg.u133.plus.2")
# Alternative custom-CDF annotation packages (currently disabled):
#library(pd.hgu133plus2.hs.gencodeg)
#library(pd.hgu133plus2.hs.ensg)

# Show the package's table summary (table names and their row counts).
slot(pd.hg.u133.plus.2, "tableInfo")
## tbl row_count
## 1 featureSet 54675
## 2 mmfeature 604258
## 3 pmfeature 604258
# GEO series accession processed in this section.
gse <- 'GSE9452'

### Expression Data
# Directory holding the raw CEL files of the series (<accession>_RAW).
celDir <- file.path('data/fromGEO', paste0(gse, '_RAW'))

# List the CEL files with their full paths, gzipped files included.
celFiles <- list.celfiles(celDir, full.names = TRUE, listGzipped = TRUE)

# Fail early with a clear message instead of an obscure reader error when the
# directory is missing or empty.
if (length(celFiles) == 0) {
  stop('No CEL files found in ', celDir, call. = FALSE)
}
celFiles

# Read the raw intensities using the manufacturer platform-design package.
rawData <- read.celfiles(celFiles, pkgname = 'pd.hg.u133.plus.2')
#rawData@annotation

# Summarise the raw data with oligo's RMA implementation.
probesetData <- oligo::rma(rawData)
#probesetData
#probesetData@featureData@data

# Expression matrix taken from the RMA result.
exprData <- exprs(probesetData)

# Keep only the part of each column name before the first '.', i.e. strip the
# file-name suffix (presumably leaving the GSM accession; verify against the
# CEL file names).
colnames(exprData) <- sub('\\..*$', '', colnames(exprData))
#saveRDS(exprData, file=paste0('data/rData/', gse, '_Expr.rds'))
### Phenotype Data
# Fetch the series matrix for the accession, using data/fromGEO/ as the
# download directory.
seriesMatrix <- getGEO(gse, AnnotGPL = TRUE, GSEMatrix = TRUE, destdir = 'data/fromGEO/') # AnnotGPL = TRUE
#seriesMatrix <- getGEO(filename=paste0('data/fromGEO/', gse, '_series_matrix.txt.gz'), AnnotGPL = FALSE) # AnnotGPL = TRUE
#exprData2 <- exprs(seriesMatrix[[1]])
#exprData2[1:5,1:5]

# Sample annotation of the first element of the returned list
# (NOTE(review): assumes the series has a single platform - confirm).
phenoData <- pData(seriesMatrix[[1]])

###
# Recode the free-text characteristics into three disease labels. The match
# strings (including the 'vissible' spelling) are matched literally against
# the GEO text and must not be "corrected".
disease <- as.character(phenoData$characteristics_ch1)
disease[grepl('Control', disease, fixed = TRUE)] <- 'Healthy control'
disease[grepl('no macroscopic signs of inflammation', disease, fixed = TRUE)] <- 'UC non-inflamed'
disease[grepl('macroscopic inflammation vissible', disease, fixed = TRUE)] <- 'UC inflammation'

# Surface any sample whose characteristics matched none of the patterns above;
# such values are otherwise carried through unchanged.
knownDisease <- c('Healthy control', 'UC non-inflamed', 'UC inflammation')
if (!all(disease %in% knownDisease)) {
  warning('Unrecognised disease label(s): ',
          paste(unique(disease[!disease %in% knownDisease]), collapse = '; '),
          call. = FALSE)
}

# All samples of this series are assigned the same tissue.
tissue <- 'Colon'

# Subject identifier: the first run of digits in the sample title, as a plain
# character vector (str_extract) rather than the one-column matrix returned
# by str_match.
subject <- str_extract(phenoData$title, '\\d+')

# Compact phenotype table keyed by the original sample row names. Column types
# follow the session's default for stringsAsFactors.
phenoData <- data.frame(disease,
                        tissue,
                        subject,
                        row.names = rownames(phenoData))