Microarray preliminary analysis in R
Microarrays can be used to characterize the expression profile of a battery of known genes in cells, enabling research on several biology topics such as cancer.
In this entry I will show how to get started in their analysis. The microarray data used here is available at GEO and was originally published in this Nature article.
Antonio Ortega
February 21, 2017
# Load microarray packages
library("affy")
library("annaffy")
library("simpleaffy")
library("affyPLM")
library("limma")
library("oligo")
# Load tidyverse packages
library("magrittr")
library("ggplot2")
library("tidyr")
library("dplyr")
library("ggvis")
library("shiny")
Data input
microarray.raw.data <- read.celfiles(filenames = list.files(path = "GSE62166_RAW",
full.names = T ))
## Reading in : GSE62166_RAW/GSM1520869_Greece_EA_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520870_Greece_EB_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520871_Greece_EC_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520872_Greece_ED_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520873_Greece_FA_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520874_Greece_FB_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520875_Greece_FD_36_20140401_HTA-2_0.CEL.gz
Data preprocessing
microarray.processed.data <- rma(microarray.raw.data)
## Background correcting
## Normalizing
## Calculating Expression
expression.level <- exprs(microarray.processed.data)
Before and after rma algorithm histograms
par(mfrow=c(1,2))
hist(microarray.raw.data %>% exprs, col = rainbow(75))
hist(expression.level, col = rainbow(75))
Metadata
probe.names <- row.names(expression.level)
conditions.ids <- c("on", "off")
groups <- rep(conditions.ids, times = c(4, 3))
id <- c(1:4, 1:3)
samples <- paste(groups, id, sep="_")
colnames(expression.level) <- samples
Data munging
data_wide <- cbind(probe.names = probe.names, expression.level) %>% as.data.frame
data_long <- gather(data_wide, condition,
measurement, on_1:off_3,
factor_key=TRUE)
## Warning: attributes are not identical across measure variables; they will
## be dropped
data_long <- data_long %>% mutate(measurement = as.numeric(measurement))
data_long$replica <- rep(c(1:4, 1:3), each = 70523)
data_long$condition <- rep(c("on", "off"),
times = c(70523 * 4, 70523 * 3))
data_long <- data_long[, c(1, 2, 4, 3)]
complete_long <- data_long[complete.cases(data_long),]
summarised <- complete_long %>% group_by(probe.names, condition) %>%
summarize(media = mean(measurement))
data_wide <- summarised %>% spread(condition, media)
Data visualization with ggplot2 and ggvis
#ggplot2
p <-ggplot(data = data_wide) +
geom_point(aes(x = on, y = off))
p
#ggvis
# Uncomment when running R interactively. A interactive graphic is generated
# that shows the probe name when the mouse is hovered over a point in the scatterplot
# base <- data_wide %>%
# head(100) %>%
# ggvis(~on, ~off) %>%
# layer_points
#base %>% add_tooltip(function(data_wide) { paste0(data_wide$probe.names)} , "hover")