Microarrays can be used to characterize the expression profile of a battery of known genes in cells, enabling research on several biology topics such as cancer.

In this entry I will show how to get started in their analysis. The microarray data used here is available at GEO and was originally published in this Nature article.

# Load microarray packages
library("affy")
library("annaffy")
library("simpleaffy")
library("affyPLM")
library("limma")
library("oligo")
# Load tidyverse packages
library("magrittr")
library("ggplot2")
library("tidyr")
library("dplyr")
library("ggvis")
library("shiny")

Data input

microarray.raw.data <- read.celfiles(filenames = list.files(path = "GSE62166_RAW",
                                                            full.names = T ))
## Reading in : GSE62166_RAW/GSM1520869_Greece_EA_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520870_Greece_EB_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520871_Greece_EC_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520872_Greece_ED_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520873_Greece_FA_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520874_Greece_FB_36_20140401_HTA-2_0.CEL.gz
## Reading in : GSE62166_RAW/GSM1520875_Greece_FD_36_20140401_HTA-2_0.CEL.gz

Data preprocessing

microarray.processed.data <- rma(microarray.raw.data)
## Background correcting
## Normalizing
## Calculating Expression
expression.level <- exprs(microarray.processed.data)

Before and after rma algorithm histograms

par(mfrow=c(1,2))
hist(microarray.raw.data %>% exprs, col = rainbow(75))
hist(expression.level, col = rainbow(75))

Metadata

probe.names <- row.names(expression.level)
conditions.ids <- c("on", "off")
groups <- rep(conditions.ids, times = c(4, 3))
id <- c(1:4, 1:3)
samples <- paste(groups, id, sep="_")
colnames(expression.level) <- samples

Data munging

data_wide <- cbind(probe.names = probe.names, expression.level) %>% as.data.frame
data_long <- gather(data_wide, condition,
                    measurement, on_1:off_3,
                    factor_key=TRUE)
## Warning: attributes are not identical across measure variables; they will
## be dropped
data_long <- data_long %>% mutate(measurement = as.numeric(measurement))
data_long$replica <- rep(c(1:4, 1:3), each = 70523)
data_long$condition <- rep(c("on", "off"),
                           times = c(70523 * 4, 70523 * 3))
data_long <- data_long[, c(1, 2, 4, 3)]
complete_long <- data_long[complete.cases(data_long),]
summarised <- complete_long %>% group_by(probe.names, condition) %>%
  summarize(media = mean(measurement))

data_wide <- summarised %>% spread(condition, media)

Data visualization with ggplot2 and ggvis

#ggplot2
p <-ggplot(data = data_wide) +
  geom_point(aes(x = on, y = off))

p

#ggvis

# Uncomment when running R interactively. A interactive graphic is generated
# that shows the probe name when the mouse is hovered over a point in the scatterplot

# base <- data_wide %>%
#   head(100) %>%
#   ggvis(~on, ~off) %>%
#   layer_points

#base %>% add_tooltip(function(data_wide) { paste0(data_wide$probe.names)} , "hover")