######################################################################
# Analyze "All" CAMDA '06 GeneExpression data
# efg, Rewritten 23 March 2006.  Updated 25 April 2006.
#
# Load all programs/data for analysis:
#   source("0-LoadData.R")
#
# LoadAllData()
#  - creates global variables d & log2d for subsequent analysis.
#    They are read as 177 by 20160 and then trimmed to the 172 arrays
#    that have matching clinical data (172 by 20160).
#  - creates global variables ControlSet1 .. 4, and ControlSet to identify
#    control probes in the microarray datasets
#  - reads "clusters", selected data about 227 patients, including
#    "CLUSTER".  NOTE(review): "clusters" is local to SetupInfo() and is
#    NOT left behind as a global; only "Info" is published.
#  - creates global variable "Info" of selected data about the patients
#    with microarray data (172 rows by 8 columns after unmatched arrays
#    are dropped)
######################################################################

# Read the expression matrix and publish it, its log2 transform and the
# control-probe masks as global variables.
#
# Reads "AllExpression.csv" from the current working directory; the first
# column supplies the row names, every remaining column is expression data.
#
# Globals created: d, log2d, ControlSet1 .. ControlSet4, ControlSet.
# Stops if the expression matrix is not 177 rows by 20160 columns.
ReadData <- function() {
  raw <- read.csv("AllExpression.csv")

  expression <- data.matrix(raw[, 2:ncol(raw)])
  rownames(expression) <- raw[, 1]

  # log2(1 + d) used to avoid problems with zero values
  log2_expression <- log2(1 + expression)

  # TRUE for every column whose name starts with the given prefix.
  probe_names <- colnames(log2_expression)
  has_prefix <- function(prefix) {
    substr(probe_names, 1, nchar(prefix)) == prefix
  }

  # Make global
  d <<- expression
  log2d <<- log2_expression

  ControlSet1 <<- has_prefix("mwgaracontrol")
  ControlSet2 <<- has_prefix("mwghumcontrol")
  ControlSet3 <<- has_prefix("mwghuman")
  ControlSet4 <<- has_prefix("Blank")
  ControlSet <<- ControlSet1 | ControlSet2 | ControlSet3 | ControlSet4

  # 177 rows by 20160 columns
  stopifnot(all(dim(d) == c(177, 20160)))
}

# Build the clinical "Info" table, one row per array that has clinical data.
#
# Requires the global d (see ReadData).  The first 8 characters of each row
# name of d are matched against the ABTID column of the classification file.
# Arrays with no match, or with more than one match, are reported with cat()
# and excluded from the result.
#
# Globals created: Info.
# Returns Info invisibly.  Stops if Info is not 172 rows by 8 columns.
SetupInfo <- function() {
  # 227 rows by 8 columns
  filename <- "C:/CAMDA06/1-Clinical/Classification.csv"  # From survey data
  clusters <- read.csv(filename, as.is = TRUE)
  clusters <- clusters[order(clusters$ABTID), ]

  # Form Info structure to match rows of d and log2d
  # (probably a better way to do this than one by one)
  Info <- clusters[0, ]
  # seq_len() rather than 1:nrow(d) so an empty d yields no iterations.
  for (patient in seq_len(nrow(d))) {
    ABTID <- substr(rownames(d)[patient], 1, 8)
    x <- clusters[clusters$ABTID %in% ABTID, ]
    if (nrow(x) == 1) {
      Info[patient, ] <- x
      rownames(Info)[patient] <- rownames(d)[patient]
    } else {
      cat("Could not match patient row", patient, ABTID, "\n")
    }
  }

  # Exclude 5 arrays that have no clinical data.  Rows skipped above are
  # left as all-NA placeholders when a later row is assigned, so they are
  # dropped here by their missing ABTID.
  Info <- Info[!is.na(Info$ABTID), ]
  print(dim(Info))
  stopifnot(all(dim(Info) == c(172, 8)))

  # Make global
  Info <<- Info
  invisible(Info)
}

# Load the expression and clinical data and align them.
#
# After this call the globals d, log2d and Info all describe the same 172
# arrays, in the same row order.  Prints the CLUSTER by sex cross-tabulation.
LoadAllData <- function() {
  ReadData()   # 177 arrays
  SetupInfo()  # 172 arrays with clinical data

  # Trim d and log2d to only have array data when
  # matching clinical data are available
  ArraysWithClinicalSet <- rownames(d) %in% rownames(Info)
  d <<- d[ArraysWithClinicalSet, ]
  log2d <<- log2d[ArraysWithClinicalSet, ]

  stopifnot(all(dim(d) == c(172, 20160)))
  stopifnot(all(rownames(Info) == rownames(d)))
  stopifnot(all(rownames(Info) == rownames(log2d)))

  print(table(Info$CLUSTER, Info$sex))
  #              Female Male
  #   Excluded     43    6
  #   Least        32   12
  #   Middle       40   13
  #   Worst        23    3
}

######################################################################

# Write a small blue footer, "<footer> <timestamp>", in the bottom outer
# margin of the current plot.  The timestamp is the current time formatted
# as "%Y %b %d %H:%M".
PlotFooter <- function(footer) {
  stamped <- paste(footer, format(Sys.time(), "%Y %b %d %H:%M"))
  # side = 1 is the bottom ("south") margin.
  mtext(stamped, side = 1, adj = 0.025, outer = TRUE, cex = 0.8,
        line = -0.5, col = "blue")
}

######################################################################

# Capture everything LoadAllData() prints in a log file.  The finally
# clause guarantees the diversion is removed even when loading fails;
# otherwise all later console output would keep going to the file.
sink("1-LoadData.txt")
invisible(tryCatch(LoadAllData(), finally = sink()))