# efg, Stowers Institute for Medical Research, December 2005
# Read All probes.
#
# Problems reading GeneExpression data in R:
#
# (1) R doesn't like the header labels in the files, e.g., R does not
#     like blanks or percent signs (%).  Instead of attempting an
#     automatic "fix" to the column names, the names are simply
#     assigned (mostly with blanks removed, % -> PC).
#
# (2) Some of the SpotLabels contain a "#", which by default R treats as a
#     comment.  Use comment.char="" in read.delim to avoid this problem.
#
# (3) All but one of the GeneExpression files have a tab separator at the end
#     of each line.  Since R treats tabs as separators, this final tab
#     resulted in an extra column of NAs.  (Apparently one file,
#     22149604A-03.txt, was opened in an editor that removed this final tab,
#     and then resaved.)  Special logic was added to detect and delete a 17th
#     column of NAs with a name of "X" in all but this one file.
#
#     Comments from Suzanne Vernon, Centers for Disease Control and Prevention,
#     in private E-mail, 12/16/2005:
#       "As for the strange file, we have it too and unfortunately we
#        are not sure how it got there.  It was obviously overlooked when
#        we submitted the final data to CAMDA - and we are very sorry about
#        that.  You should ignore or delete this file and you can note that
#        on your website too."
#
# (4) The file 22149604A-03.txt is more than half a megabyte smaller than
#     the other files.  As mentioned above in (3), this file seems to have
#     been edited by some program that not only stripped the trailing tabs,
#     but also changed many of the "0.000" values to simply "0".  This file
#     has about 500,000 fewer zeros, 170,000 fewer periods, and 20,000 fewer
#     tabs than the rest of the files.
# Root of the project share; it is mounted under a different prefix on
# each supported platform.  Any other platform is rejected outright.
OS.Prefix <- switch(.Platform$OS.type,
  windows = "U:",
  unix    = "/n/projects",
  stop("unsupported OS platform")
)

# Directory holding one tab-delimited GeneExpression .txt file per array.
basepath <- paste0(OS.Prefix, "/camda/2006/gene_expression_values")

# AllExpression
#
# Reads every .txt file found in `basepath`, keeps column 1 (the spot
# label) and column 10 (the expression value stored as "sARMDens"), and
# collects the values in a numeric matrix with one row per file and one
# column per spot.  The matrix is written to "AllExpression.csv" in the
# current working directory, and the elapsed load and save times are
# printed.
#
# Takes no arguments; relies on the global `basepath`.
#
# Stops with an error when no .txt files are found, when a file does not
# yield exactly 2 selected columns and 20160 rows, when the value column
# is not numeric, or when a file's spot labels differ from those of the
# first file.
AllExpression <- function() {
  Load.Start <- proc.time()[["elapsed"]]

  # `pattern` is a regular expression: escape the dot and anchor at the
  # end so that only names really ending in ".txt" are picked up.
  txtfile.list <- list.files(path = basepath, pattern = "\\.txt$")
  if (length(txtfile.list) == 0) {
    stop("no .txt files found in ", basepath)
  }

  # Names assigned to the two retained columns.  Assigning them here makes
  # the code independent of how read.delim mangles the labels in the
  # file's own header line.
  ColumnNames <- c("Spot", "sARMDens")

  RawData <- NULL

  for (k in seq_along(txtfile.list)) {
    filename <- txtfile.list[k]
    cat(k, " ", filename)

    # comment.char = "" because some spot labels contain a "#".
    d <- read.delim(file.path(basepath, filename),
                    comment.char = "", as.is = TRUE)[, c(1, 10)]

    # Make sure data.frame "d" has the correct dimensions
    stopifnot(ncol(d) == 2)
    stopifnot(nrow(d) == 20160)

    names(d) <- ColumnNames
    spot.labels <- as.character(d[["Spot"]])

    # A non-numeric value column would silently turn the whole matrix
    # into character data, so reject it explicitly.
    stopifnot(is.numeric(d[["sARMDens"]]))

    # Save Row and Column names for first dataset.
    if (k == 1) {
      RawData <- matrix(0, length(txtfile.list), nrow(d))
      rownames(RawData) <- sub("\\.txt$", "", txtfile.list)
      colnames(RawData) <- spot.labels
    }

    # Compare spot labels against first dataset.  identical() also copes
    # with missing labels, where `==` would yield NA.
    stopifnot(identical(colnames(RawData), spot.labels))

    cat("\n")
    flush.console()

    # Keep only the expression values; the spot labels are redundant with
    # the column names of RawData.
    RawData[k, ] <- d[["sARMDens"]]
  }

  # Load Time
  Load.End <- proc.time()[["elapsed"]]

  # Note: data.frame() passes the spot labels through make.names(), so
  # characters such as "#" or blanks are replaced in the CSV header.
  write.csv(data.frame(RawData), file = "AllExpression.csv")
  #write.csv(data.frame(t(RawData)), file="AllExpressionTransposed.csv")
  Save.End <- proc.time()[["elapsed"]]

  cat("Time to load:", Load.End - Load.Start, "sec\n")
  cat("Time to save:", Save.End - Load.End, "sec \n")
}

AllExpression()