# Eric Minikel
# CureFFI.org
# 2013-08-23
# Gene expression analysis QC pipeline demo script

# On Unix systems, use these commands to get the data.
# wget ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS4nnn/GDS4146/soft/GDS4146.soft.gz
# wget ftp://ftp.ncbi.nlm.nih.gov/geo/platforms/GPLnnn/GPL97/annot/GPL97.annot.gz
# wget http://www.cureffi.org/wp-content/uploads/2013/08/metadata.txt
# also manually download the Human BodyMap 2.0 FPKMs: https://docs.google.com/file/d/0BxObyChB_2VXUTRyczRITEVHTG8/edit?usp=sharing
# gunzip *.gz

# set working directory - this should be the only thing you need to change to get this code to work for you
setwd('c:/sci/026rplcl/data/microarray/practice2/')

# only if you don't have these packages already:
# install.packages("stringr")
# install.packages("gap")
# install.packages("sqldf")
# install.packages("reshape")
library(stringr) # makes string splitting, indexing, regexes easier - R's base functions are quite bad at this
library(gap) # best library I've found for qq plots
library(sqldf) # for me, JOIN & GROUP BY are easier to use than R's match/merge and aggregate
library(reshape) # for converting between matrices and relational models
options(stringsAsFactors=FALSE) # so that strings stay as strings in all read.table calls

annot = read.table('GPL97.annot',quote='',comment.char='!',skip=27,sep='\t',header=TRUE)
colnames(annot) # check that the header read correctly
dim(annot) # check right number of rows and columns
annot$Chromosome = str_extract(annot$Chromosome.location,".*?[p|q]") # Chromosome by itself is often not included in annotations, but should be.
annot$Chromosome = gsub("[p|q]","",annot$Chromosome)

ma_raw = read.table('GDS4146.soft',skip=188,comment.char='!',header=TRUE,sep='\t')
colnames(ma_raw)
# peek at the data
ma_raw[1:5,1:5]

# example of how to parse sex out of column names like GSM601942.M, if they were formatted that way
example_names = c("probe_id","A10.patient01.M","C05.patient47.F","D07.patient11.F")
example_matrix = matrix(nrow=1,ncol=4)
colnames(example_matrix) = example_names
example_sex=as.character(sapply(strsplit(colnames(example_matrix)[2:4],split="\\."),"[[",3))

# extract metadata from header
expt_id = as.integer(sapply(strsplit(colnames(ma_raw)[3:127],split="M"),"[[",2))
timepoint_raw = (expt_id + 3)%%5
patient_id_raw = floor( (expt_id + 3) / 5) - 120375 + 1
# in this case, sex isn't in the header - I extracted it manually from the SOFT file into a separate file.
metadata = read.table('metadata.txt',header=TRUE,sep='\t')

# convert the whole raw data frame into a matrix
probes = ma_raw[,1:2] # save the probe info to a separate data frame
ma_mat = as.matrix(ma_raw[3:127]) # convert the values part of the data frame to a matrix
rownames(ma_mat) = probes[,1] # assign the probe ids as row names
# peek at the data
ma_mat[1:5,1:5]

# for this demonstration, I'm just going to use a subset of the columns
ma_mat = as.matrix(ma_raw[,which(timepoint_raw==3)+2]) # use all 25 patients but only at timepoint #3
dim(ma_mat) # check that it worked

# match metadata and rename columns
sex = metadata$sex[match(colnames(ma_mat),metadata$sample_id)]
patient_id = metadata$patient_id[match(colnames(ma_mat),metadata$sample_id)]
colnames(ma_mat) = paste("P",formatC(patient_id,width=2,flag="0"),sep='')
rownames(ma_mat) = ma_raw$ID_REF

# peek at the data again
ma_mat[1:5,1:5]

# create a histogram of the probe levels
png('probe.level.histogram.png')
hist(ma_mat,breaks=100,col='yellow',main='Histogram of probe expression levels',xlab='Expression level')
dev.off()

# create a histogram of probe levels of highly expressed probe targets
png('probe.level.histogram.10000plus.png')
hist(ma_mat[ma_mat > 10000],breaks=100,col='yellow',main='Histogram of probe expression levels > 10,000',xlab='Expression level')
dev.off()

# create an all-by-all correlation matrix
cormat = matrix(nrow=25,ncol=25)
for (i in 1:25) {
  for (j in 1:25) {
    cormat[i,j] = cor.test(ma_mat[,i],ma_mat[,j])$estimate
  }
}
colnames(cormat) = colnames(ma_mat)
rownames(cormat) = colnames(ma_mat)

# what are the highest and lowest correlation (highest is always 1 since you compare each sample to itself in the above loop)
range(cormat)

# use image() to make a DIY heat map - this is more flexible than heatmap(cormat) because it uses a reasonable coordinate system
png('allxall.correlation.matrix.png')
image(1:25,1:25,cormat,xaxt='n',xlab='',yaxt='n',ylab='',main='Protein expression profile correlation between samples',cex.main=.8)
for (i in 1:25) {
  for (j in 1:25) {
    text(i,j,labels=format(cormat[i,j],digits=2),cex=.5)
  }
}
axis(side=1,at=seq(1,25,1),labels=colnames(ma_mat),cex.axis=.6,las=2)
axis(side=2,at=seq(1,25,1),labels=colnames(ma_mat),cex.axis=.6,las=2)
dev.off()

# histogram of correlations between samples
png('allxall.correlation.histogram.png')
hist(cormat,breaks=100,main="Histogram of Pearson's correlation between different samples",col='yellow')
dev.off()

# check which genes are most highly expressed
unique(annot$Gene.symbol[rowMeans(ma_mat) > 50000])

# figure out which one is a problem
problem_sample = rownames(cormat)[rowMeans(cormat) == min(rowMeans(cormat))]
problem_sample
# how to remove the problem sample if you want to
ma_mat_cleaned = ma_mat[, -which(colnames(ma_mat) == problem_sample)]
dim(ma_mat_cleaned)
# and if you do, don't for get to update sex, patient_id etc. all your metadata vectors to match
# but I'll leave it in for now.

# explore relationship between probes, genes and transcripts
length(unique(annot$ID)) # probes
length(unique(annot$Gene.symbol)) # gene symbols 
length(unique(annot$GenBank.Accession)) # GenBank Accession numbers, ~= RefSeq transcripts

# try grouping by gene, averaging
ma_gene_raw = aggregate(ma_mat ~ annot$Gene.symbol, FUN='mean')
ma_gene = as.matrix(ma_gene_raw[,2:26]) # shave off the extra column added by aggregate
rownames(ma_gene) = ma_gene_raw[,1] # set that extra column as the row names
colnames(ma_gene) = colnames(ma_mat)
dim(ma_gene) # 10274 25 - we have 10,274 genes and 25 samples

# plot the levels of the housekeeping gene GAPDH
png('barplot.gapdh.absolute.level.png')
barplot(ma_gene["GAPDH",],col='yellow',main='Absolute levels of GAPDH',cex.names=.5,las=2)
dev.off()
range(ma_gene["GAPDH",])
# figure out which one is so high
which(ma_gene["GAPDH",] > 40000)

# transform whole matrix to rank space
ma.rank = ma_gene
for (i in 1:25) {
  ma.rank[,i] = rank(-ma_gene[,i],na.last='keep')
}

png('barplot.gapdh.rank.png')
barplot(dim(ma.rank)[1] - ma.rank['GAPDH',], ylim = c(1,dim(ma.rank)[1]), yaxt='n', col='yellow', cex.names=.6,las=2, main = "GAPDH rank relative to other genes")
axis(side=2, at = c(1,5000,dim(ma.rank)[1]), labels=c("lowest","5000","1 (highest)") )
dev.off()

# compare to PRND, which is not a housekeeping gene
png('barplot.prnd.absolute.level.png')
barplot(ma_gene["PRND",],col='yellow',main='Absolute levels of PRND',cex.names=.5,las=2)
dev.off()
range(ma_gene["PRND",])
# 20.2 722.4

png('barplot.prnd.rank.png')
barplot(dim(ma.rank)[1] - ma.rank['PRND',], ylim = c(1,dim(ma.rank)[1]), yaxt='n', col='yellow', cex.names=.6,las=2, main = "PRND rank relative to other genes")
axis(side=2, at = c(1,5000,dim(ma.rank)[1]), labels=c("lowest","5000","1 (highest)") )
dev.off()

# test each and every probe for sex differences
sex.p.vals = vector(mode="numeric",length=dim(ma_mat)[1])
for (i in 1:dim(ma_mat)[1]) {
  sex.p.vals[i] = t.test(ma_mat[i,] ~ sex)$p.value
} # this loop takes ~ 1 minute

png('sex.probe.qqplot.png')
qqunif(sex.p.vals,pch=19,main='QQ plot of microarray probes associated with sex',cex.main=.8)
abline(h=-log10(.05/dim(ma_mat)[1]),col='darkgreen')
text(0,-log10(.05/dim(ma_mat)[1])+.25,labels='Bonferroni-corrected .05 significance threshold',col='darkgreen',pos=4,cex=.6)
dev.off()

# get lowest Bonferroni-corrected p value
min(sex.p.vals * dim(ma_mat)[1])
# get lowest FDR-corrected p value
fdr.corrected = p.adjust(sex.p.vals, method="hochberg")
min(fdr.corrected)

# which genes are sex-associated? which chromosome are they on?
unique(annot[annot$ID %in% rownames(ma_mat)[sex.p.vals < .05/dim(ma_mat)[1]], c("Chromosome","Gene.symbol")])

# select random subset of 10% of the probes to run PCA
set.seed(2222)
random.subset = ma_mat[runif(n=dim(ma_mat)[1],min=0,max=1) > .9,]
# now calculate principal components
tmat = t(random.subset) # transpose
sum(is.na(tmat)) # 0
tmat[is.na(tmat)] = 0 # convert NA to 0 for PCA
covmatrix = cov(tmat) # calculate covariance matrix
eig = eigen(covmatrix) # this takes about 1 minute 
pc = tmat %*% eig$vectors # multiply matrices to get principal components# eig = eigen(covmatrix)

# plot PC1 vs. PC2
png('pc1.vs.pc2.png')
plot(pc[1:25,1],pc[1:25,2],pch=19,main='PC1 vs. PC2',cex.main=.8,xlab='PC1',ylab='PC2')
points(pc[sex=='M',1],pc[sex=='M',2],pch=19,col='blue')
points(pc[sex=='F',1],pc[sex=='F',2],pch=19,col='red')
text(pc[1:25,1],pc[1:25,2],labels=colnames(ma_mat),pos=1)
dev.off()

# plot the variance explained by each PC
png('pc.variance.explained.png')
plot(1:50,(eig$values/sum(eig$values))[1:50],pch=19, xlab='PC#', ylab='Varaince explained', main='Variance explained by PCs') 
dev.off()

# alternate way is to look at cumulative variance explained
png('pc.cumulative.variance.explained.png')
plot(1:50,cumsum(eig$values/sum(eig$values))[1:50], pch=16, type='b', ylim=c(0,1), xlab='PC#', ylab='Cumulative varaince explained', main='Cumulative variance explained by PCs') # running total of variance explained
dev.off()

nrow = dim(random.subset)[1]
ncol = dim(random.subset)[2]
random.numbers = matrix(rnorm(n=nrow*ncol,m=0,sd=1),nrow=nrow,ncol=ncol)
random.numbers[1:5,1:5]

rtmat = t(random.numbers) # transpose
rcovmatrix = cov(rtmat) # calculate covariance matrix
reig = eigen(rcovmatrix) # this takes about 1 minute 
rpc = rtmat %*% reig$vectors # multiply matrices to get principal components# eig = eigen(covmatrix)
png('pc.variance.explained.in.random.numbers.png')
plot(1:50,(reig$values/sum(reig$values))[1:50],pch=19, ylim=c(0,.70), xlab='PC#', ylab='Varaince explained', main='Variance explained by PCs') 
dev.off()

# do any PCs correlate with sex?
t.test(pc[1:25,1] ~ sex) 
t.test(pc[1:25,2] ~ sex) 
t.test(pc[1:25,3] ~ sex) 
t.test(pc[1:25,4] ~ sex)
t.test(pc[1:25,5] ~ sex)

# combine with mRNA-seq data from blood from Human BodyMap 2.0
hbm_df = read.table('gene.matrix.csv',sep=',',header=TRUE,stringsAsFactors=FALSE) # load table
hbm_matrix = as.matrix(hbm_df[,2:17]) # cast the FPKM columns to a matrix
rownames(hbm_matrix) = hbm_df[,1] # assign gene symbols as row names
hbm_rel = melt(hbm_matrix) # cast the HumanBodyMap 2.0 matrix into a 3-column relational model
colnames(hbm_rel) = c('gene_id','tissue','fpkm') # set column names

ma_gene_rel = melt(ma_gene) # cast the microarray matrix into a 3-column relational model
colnames(ma_gene_rel) = c('gene_id','sample','level') # set column names

sql_query = "
select   ma.gene_id, ma.level, mrna.fpkm
from     (select gene_id, avg(level) level from ma_gene_rel group by gene_id) ma, 
         hbm_rel mrna
where    ma.gene_id = mrna.gene_id
and      mrna.tissue == 'blood'
;
"
ma_mrna_compare = sqldf(sql_query)

png('microarray.vs.humanbodymap.png')
plot(ma_mrna_compare$fpkm, ma_mrna_compare$level, pch=19, 
     xlab='Human BodyMap 2.0 Blood FPKMs', 
     ylab='Average level in microarray across 25 samples',
     main='Aggregate gene expression level, microarray vs. Human BodyMap 2.0 blood mRNA-seq',
     cex.main = .8)
dev.off()

cor.test(ma_mrna_compare$fpkm, ma_mrna_compare$level, method='pearson') # .58
cor.test(ma_mrna_compare$fpkm, ma_mrna_compare$level, method='spearman') # .65