# efg, Stowers Insititute for Medical Research # 14 May 2006 sink("3-AnalyzeExpressionData.txt") library(boot) ##################################################################### # Wrapper to avoid exception when means are equal. t.test.wrapper <- function(x,y) { if (mean(x) == mean(y)) { return(1.0) } else { return( t.test(x,y)) } } ##################################################################### # Hard-code 10,000 monte carlo runs DiffMeans <- function(x, d, N) { return( mean( x[ d[1:N] ] ) - mean( x[ d[(N+1):length(d)] ] ) ) } permute.boot <- function(x,y) { # Ignore missing data x <- x[!is.na(x)] y <- y[!is.na(y)] ThetaHat <- mean(x) - mean(y) b <- boot(c(x,y), DiffMeans, R=10000, sim="permutation", N=length(x)) # two-tail value p <- sum(abs(b$t) > abs(ThetaHat)) / b$R p } ##################################################################### # This Patient List doesn't deal with the 8 replicates. Find the Patient IDs # of first and last Worst, Middle, Least patient and find those in the Expression data. Patient <- read.csv("ArrayPatients.csv", row.names=1, as.is=TRUE) dim(Patient) ClusterDelta <- which(Patient$CLUSTER[1:(nrow(Patient)-1)] != Patient$CLUSTER[2:nrow(Patient)]) WorstFirst <- rownames(Patient)[1] WorstLast <- rownames(Patient)[ClusterDelta[1]] cat("Worst", WorstFirst, WorstLast, "\n") MiddleFirst <- rownames(Patient)[ClusterDelta[1]+1] MiddleLast <- rownames(Patient)[ClusterDelta[2]] cat("Middle", MiddleFirst, MiddleLast, "\n") LeastFirst <- rownames(Patient)[ClusterDelta[2]+1] LeastLast <- rownames(Patient)[ClusterDelta[3]] cat("Least", LeastFirst, LeastLast, "\n") ExcludedFirst <- rownames(Patient)[ClusterDelta[3]+1] ExcludedLast <- rownames(Patient)[nrow(Patient)] cat("Excluded", ExcludedFirst, ExcludedLast, "\n") Probe <- read.csv("ProbeListAll.csv", row.names=2, as.is=TRUE) Probet <- Probe[,-1] # get rid of column 1 dim(Probe) Expression <- data.matrix(read.csv("SelectedProbes.csv", row.names=1, as.is=TRUE)) dim(Expression) stopifnot( nrow(Patient) + 8 == nrow(Expression)) # 8 replicates stopifnot( ncol(Expression) == nrow(Probe) ) # Relax for now. In handful of cases, "." versus "-" do no match. #stopifnot( colnames(Expression) == rownames(Probe) ) IDindex <- function(ID) { which(rownames(Expression) == ID) } WorstRange <- c(IDindex(WorstFirst) : IDindex(WorstLast)) MiddleRange <- c(IDindex(MiddleFirst) : IDindex(MiddleLast)) LeastRange <- c(IDindex(LeastFirst) : IDindex(LeastLast)) ExcludedRange <- c(IDindex(ExcludedFirst) : IDindex(ExcludedLast)) cat("Worst", WorstRange, "\n") cat("Middle", MiddleRange, "\n") cat("Least", LeastRange, "\n") cat("Excluded", ExcludedRange, "\n") set.seed(19937) # Make this analysis repeatable ttestWL <- NA # A bit of a kludge to form stats below wtestWL <- NA ttestML <- NA wtestML <- NA ttestWM <- NA wtestWM <- NA alpha <- 0.05 # Desired significance level stats <- NULL info <- NULL for (j in 1:ncol(Expression)) { cat(j, colnames(Expression)[j], "\n") flush.console() Worst <- Expression[WorstRange, j] Middle <- Expression[MiddleRange, j] Least <- Expression[LeastRange, j] kruskalWML <- kruskal.test( list(Worst, Middle, Least) ) # Don't bother with 2-way tests if 3-way comparison is not significant if (kruskalWML$p.value <= alpha) { ttestWL <- t.test.wrapper(Worst, Least) wtestWL <- wilcox.test(Worst, Least) ptestWL <- permute.boot(Worst, Least) ttestML <- t.test.wrapper(Middle, Least) wtestML <- wilcox.test(Middle, Least) ptestML <- permute.boot(Middle, Least) ttestWM <- t.test.wrapper(Worst, Middle) wtestWM <- wilcox.test(Worst, Middle) ptestWM <- permute.boot(Worst, Middle) } else { ttestWL$p.value <- NA # drop from further analysis wtestWL$p.value <- NA ptestWL <- NA ttestML$p.value <- NA wtestML$p.value <- NA ptestML <- NA ttestWM$p.value <- NA wtestWM$p.value <- NA ptestWM <- NA } # Two-Way Comparison for Sick Vs. Least Sick <- c(Worst, Middle) ttestSL <- t.test.wrapper(Sick, Least) wtestSL <- wilcox.test(Sick, Least) ptestSL <- permute.boot(Sick, Least) stats <- rbind(stats, c(mean(Worst), mean(Middle), mean(Least), mean(c(Worst,Middle)), kruskalWML$p.value, ttestWL$p.value, wtestWL$p.value, ptestWL, ttestML$p.value, wtestML$p.value, ptestML, ttestWM$p.value, wtestWM$p.value, ptestWM, ttestSL$p.value, wtestSL$p.value, ptestSL)) info <- rbind(info, c(Probe$System[j], Probe$Category[j], Probe$Name[j])) } colnames(stats) <- c("mean.Worst", "mean.Middle", "mean.Least", "mean.Sick", "kruskal.p", "WL.t.p", "WL.rank.p", "WL.perm.p", "ML.t.p", "ML.rank.p", "ML.perm.p", "WM.t.p", "WM.rank.p", "WM.perm.p", "SL.t.p", "SL.rank.p", "SL.perm.p") colnames(info) <- c("System", "Category", "Probe") rownames(stats) <- colnames(Expression) write.csv(cbind(stats,info), file="ExpressionStats1Raw.csv", row.names=TRUE) dim(stats) # Dunn-Sidak Family-Wise Error Rate (FWER) Adjustment # Only adjust 3-way WML comparisons. Don't modify 2-way Sick-Least comparison. pStatRange <- 6:(ncol(stats)-3) # Exclude Sick-Least p values for (j in 1:nrow(stats)) { stats[j, pStatRange] <- 1.0 - (1.0 - stats[j,pStatRange])^3 # Dunn-Sidak adjustment } write.csv(cbind(stats,info), file="ExpressionStats2DunnSidak.csv", row.names=TRUE) dim(stats) # Make Multiple Test Correstion by System-Category stopifnot(nrow(Probe) == nrow(stats)) System <- paste(Probe$System, Probe$Category) SystemCount <- table(System) SystemCount write.csv(SystemCount, file="SystemCounts.csv", row.names=TRUE) for (j in 6:(ncol(stats))) { for (k in 1:length(SystemCount)) { SystemIndex <- which(System == names(SystemCount)[k]) stats[SystemIndex, j] <- p.adjust( as.double(stats[SystemIndex, j]), method="BH") } } write.csv(cbind(stats,info), file="ExpressionStats3SystemAdjust.csv", row.names=TRUE) # Final "tidy" list stats <- as.data.frame(stats) # cannot use stats$kruskal.p on double data.matrix KeepList <- stats$kruskal.p <= alpha info <- info[KeepList,] stats <- stats[KeepList,] # Keep only those passing Kruskal-Wallis # Could not do earlier because of "Sick-Least" comparisons dim(stats) WL <- ifelse(stats$WL.rank.p <= alpha, "X", "") ML <- ifelse(stats$ML.rank.p <= alpha, "X", "") WM <- ifelse(stats$WM.rank.p <= alpha, "X", "") SL <- ifelse(stats$SL.rank.p <= alpha, "X", "") LogRatioWL <- stats$mean.Least / stats$mean.Worst LogRatioML <- stats$mean.Least / stats$mean.Middle LogRatioWM <- stats$mean.Middle / stats$mean.Worst LogRatioSL <- stats$mean.Least / stats$mean.Sick stats <- cbind(stats, LogRatioWL, LogRatioML, LogRatioWM, LogRatioSL, info, WL, ML, WM, SL) dim(stats) write.csv(stats, file="ExpressionStats4Final.csv", row.names=TRUE) sink()