# efg.  Stowers Institute.
# 7 Feb 2006.  Updated 14 May 2006.
#
# Extracts, from the scaled patient expression matrix, only the probes listed
# in ProbeListUnique.csv, with rows ordered by patient and columns ordered by
# the probe list.  Writes:
#   2-ExtractExpressionData.txt  log of the values printed below (via sink)
#   ArrayPatients.csv            patients that have at least one array
#   SelectedProbes.csv           arrays x selected probes
#   ProbeListAll.csv             per-column probe annotation
#
# The bare expressions such as dim(...) are intentionally left at top level:
# their auto-printed output is what ends up in the sink file.  The trailing
# "# 227 8"-style comments record the values seen in the original run.

sink("2-ExtractExpressionData.txt")

# Patient Order (Load all patients)
filename <- "C:/CAMDA06/1-Clinical/Classification.csv"
patient <- read.csv(filename, as.is = TRUE)
dim(patient)   # 227 8
patient$order <- seq_len(nrow(patient))

# Gene/Probe Order (List of only probes of interest)
basepath <- "C:/CAMDA06/5-GeneExpressionAnalysis/b-PsychoNeuroendocrineImmune/"
filename <- paste0(basepath, "ProbeListUnique.csv")
probe <- read.csv(filename, as.is = TRUE)
dim(probe)     # 1917 10
probe$order <- seq_len(nrow(probe))

# Expression Data [Five "extra" arrays have already been excluded]
basepath <- "C:/CAMDA06/4-GeneExpressionScaling/"
filename <- paste0(basepath, "PatientScaledExpression.csv")
Expression <- read.csv(filename, as.is = TRUE)
colnames(Expression)[1] <- "ID"
dim(Expression)   # 172 19701

# The first 8 characters of the array ID are matched against patient$ABTID.
Expression.match.ID <- as.numeric(substr(Expression$ID, 1, 8))
length(Expression.match.ID)   # 172

# Keep only patients that have an array, then renumber them 1..n.
PatientMatch <- patient$ABTID %in% Expression.match.ID
patient <- patient[PatientMatch, ]
patient$order <- seq_len(nrow(patient))
write.csv(patient, file = "ArrayPatients.csv", row.names = FALSE)
dim(patient)   # 164 9  (8 duplicates)

# For all Gene Expression patients, establish CLUSTER order.
# match() returns the first matching patient row; incomparables = NA keeps an
# unparseable (NA) array ID from ever matching, as in the original == test.
PatientRow <- match(Expression.match.ID, patient$ABTID, incomparables = NA)
Unmatched <- is.na(PatientRow)
ArrayOrder <- rep(0, nrow(Expression))
ArrayOrder[!Unmatched] <- patient$order[PatientRow[!Unmatched]]
Expression$Order <- ArrayOrder
if (any(Unmatched)) {
  # Should never happen.  Order 0 is kept for these rows, but they are
  # reported because several of them would share Order 0 and so would be
  # labelled as replicates by the adjacent-equal test below.
  warning(sum(Unmatched), " expression array(s) have no matching patient: ",
          paste(Expression$ID[Unmatched], collapse = ", "),
          call. = FALSE)
}

Expression <- Expression[order(Expression$Order, Expression$ID), ]
dim(Expression)   # 172 19702

# Find location of the 8 replicates.  Label these "A" and +1 then "B".
# After sorting, replicates are adjacent rows with equal Order; head/tail
# compare each row with the next one and are safe for fewer than two rows.
DuplicateOrder <- which(head(Expression$Order, -1) == tail(Expression$Order, -1))

# List original filenames so "A" and "B" can be resolved later if necessary
Expression$ID[DuplicateOrder]
Expression$ID[DuplicateOrder + 1]

ABTID <- substr(Expression$ID, 1, 8)

# Verify duplicates
ABTID[DuplicateOrder]
ABTID[DuplicateOrder + 1]

ABTID[DuplicateOrder]     <- paste0(ABTID[DuplicateOrder], "A")
ABTID[DuplicateOrder + 1] <- paste0(ABTID[DuplicateOrder + 1], "B")

# Verify duplicates
ABTID[DuplicateOrder]
ABTID[DuplicateOrder + 1]

rownames(Expression) <- ABTID

# get rid of these columns
Expression$Order <- NULL
Expression$ID <- NULL
dim(Expression)   # 172 19700

# Order probes by category/subcategory.
# Establish gene / probe order based on PNI list.
# In R the probes like "X (1)" became "X..1.": the text before the first ".."
# is the probe name looked up in probe$probe, and the first character after
# it becomes the suffix (".1").
# NOTE(review): only one character of the replicate number is kept, so
# "X..10." would also get suffix ".1" -- confirm no probe has 10+ copies.
ColumnParts <- strsplit(colnames(Expression), "\\.\\.")
ColumnProbe <- vapply(ColumnParts, function(p) p[1], character(1))
ColumnSuffix <- vapply(
  ColumnParts,
  function(p) if (length(p) == 1) "" else paste0(".", substr(p[2], 1, 1)),
  character(1)
)

ProbeIndex <- match(ColumnProbe, probe$probe, incomparables = NA)
ProbeFound <- !is.na(ProbeIndex)
FoundIndex <- ProbeIndex[ProbeFound]

# Columns not on the probe list keep ProbeOrder 0 and blank annotation.
ProbeOrder  <- rep(0,  ncol(Expression))
ProbeGene   <- rep("", ncol(Expression))
ProbeName   <- rep("", ncol(Expression))
ProbeSuffix <- rep("", ncol(Expression))
ProbeSystem <- rep("", ncol(Expression))

ProbeOrder[ProbeFound]  <- probe$order[FoundIndex]
ProbeGene[ProbeFound]   <- probe$gene[FoundIndex]
ProbeName[ProbeFound]   <- ColumnProbe[ProbeFound]
ProbeSuffix[ProbeFound] <- ColumnSuffix[ProbeFound]
ProbeSystem[ProbeFound] <- probe$System[FoundIndex]

# Get rid of blank columns (drop = FALSE keeps a data frame even if only one
# probe column survives).
NonZeroColumns <- ProbeOrder > 0
Expression  <- Expression[, NonZeroColumns, drop = FALSE]
ProbeGene   <- ProbeGene[NonZeroColumns]
ProbeName   <- ProbeName[NonZeroColumns]
ProbeSuffix <- ProbeSuffix[NonZeroColumns]
ProbeSystem <- ProbeSystem[NonZeroColumns]
ProbeOrder  <- ProbeOrder[NonZeroColumns]

# Reorder everything now by order(ProbeOrder), computed once.
ProbeSort <- order(ProbeOrder)
Expression  <- Expression[, ProbeSort, drop = FALSE]
ProbeGene   <- ProbeGene[ProbeSort]
ProbeName   <- ProbeName[ProbeSort]
ProbeSuffix <- ProbeSuffix[ProbeSort]
ProbeSystem <- ProbeSystem[ProbeSort]
ProbeOrder  <- ProbeOrder[ProbeSort]

colnames(Expression) <- paste0(ProbeGene, ".", ProbeName, ProbeSuffix)
dim(Expression)   # 172 1914

write.csv(Expression, file = "SelectedProbes.csv")

# One annotation row per selected column; Unique is the row of `probe` the
# column was matched to, used to pull in the rest of that probe's record.
ProbeListAll <- data.frame(ID = colnames(Expression),
                           Gene = ProbeGene,
                           Name = ProbeName,
                           Suffix = ProbeSuffix,
                           System = ProbeSystem,
                           Unique = ProbeOrder)
ProbeListAll <- cbind(ProbeListAll, probe[ProbeListAll$Unique, ])

# Drop columns by position: 6 is Unique and 7 is the first column taken from
# `probe`; 10 and 17 depend on the layout of ProbeListUnique.csv.
# NOTE(review): presumably these are redundant with the columns built above --
# confirm against the CSV header before changing the probe file layout.
ProbeListAll <- ProbeListAll[, -c(6, 7, 10, 17)]
write.csv(ProbeListAll, file = "ProbeListAll.csv")

sink()