# Match CAMDA Microarray data with PNI database
# efg, 4 April 2006.  Updated 10 May 2006.
#
# Reads the CDC Psycho-Neuroendocrine-Immune (PNI) gene list and a biomaRt
# probe-to-gene table, reports how many PNI genes are present among the
# probes, and writes one row per unique matching probe (annotated with the
# selected PNI columns) to "ProbeListUnique.csv".
#
# All console output is diverted to "1-MatchBiomartWithPNIGenes.txt" until the
# final sink().  NOTE: if a statement fails part-way through, the diversion
# stays active; call sink() by hand to restore console output.
#
# Bare expressions such as nrow(PNI) only print when the script is run at the
# top level (pasted, Rscript, R CMD BATCH); they are silent under a plain
# source().  The trailing numbers in comments are the values the author
# recorded on the dates shown.

sink("1-MatchBiomartWithPNIGenes.txt")

basepath <- "C:/CAMDA06/"

# ---- PNI gene list ---------------------------------------------------------

filename <- paste0(basepath, "2-GeneSets/b-PsychoNeuroendocrineImmune/",
                   "CDC-Psycho-Neuroendocrine-Immune.csv")
PNI <- read.csv(filename, as.is = TRUE)
nrow(PNI)                  # 1622
length(unique(PNI$Gene))   # 1622

# Remember the original row order so it can be restored after merge().
# seq_len() stays valid (empty) even if the file has no data rows.
PNI$order <- seq_len(nrow(PNI))

# ---- biomaRt probe / gene table --------------------------------------------

filename <- paste0(basepath, "3-biomaRt/", "GeneInfo-2006-04-28.csv")
GeneInfo <- read.csv(filename, as.is = TRUE)
nrow(GeneInfo)   # 21,448 on 1/30; 21,950 on 4/28

# Probe records without a gene value
missing <- GeneInfo[is.na(GeneInfo$gene), ]
nrow(missing)    # 2,996 on 1/30; 2,948 on 4/28

# Keep only the records that have a gene value
GeneInfo <- GeneInfo[!is.na(GeneInfo$gene), ]
cat("probe records", nrow(GeneInfo), "\n")                  # 18,452 on 1/30; 19,002 on 4/28
cat("unique probes", length(unique(GeneInfo$probe)), "\n")  # 16,526 on 1/30; 16,573 on 4/28

Genes <- unique(GeneInfo$gene)
cat("unique genes", length(Genes), "\n")                    # 13,021 on 1/30; 13,094 on 4/28

# ---- PNI genes found / not found among the probe genes ---------------------

Match <- intersect(unique(PNI$Gene), Genes)
cat("PNI Gene Match", length(Match), "\n")   # 1350

PNIMatch <- PNI[PNI$Gene %in% Match, ]
print(table(PNIMatch$System))
print(sum(table(PNIMatch$System)))

cat("PNI Genes Not Found:\n")
NoMatch <- setdiff(unique(PNI$Gene), Match)
print(length(NoMatch))
print(NoMatch)

# ---- Probes whose gene is a matched PNI gene -------------------------------

ProbeInfo <- GeneInfo[GeneInfo$gene %in% Match, ]
dim(ProbeInfo)   # 2173 8
cat("Total Matching Probes", nrow(ProbeInfo), "\n")

# Probes that appear on more than one record
ProbeCount <- table(ProbeInfo$probe)
Duplicates <- names(ProbeCount[ProbeCount > 1])
length(Duplicates)   # 17

DuplicateProbes <- ProbeInfo[ProbeInfo$probe %in% Duplicates, 1:7]
DuplicateProbes <- DuplicateProbes[order(DuplicateProbes$probe), ]
dim(DuplicateProbes)   # 39 7
DuplicateProbes

# Keep the first record for each probe.  The column name is written in full:
# the previous "$prob" only worked through partial matching of "$".
UniqueProbes <- ProbeInfo[!duplicated(ProbeInfo$probe), ]
cat("Unique Probe Names", nrow(UniqueProbes), "\n")
dim(UniqueProbes)   # 1917 8

UniqueProbes <- UniqueProbes[order(UniqueProbes$gene), ]

# ---- Attach PNI annotation to the unique probes ----------------------------

# Columns are picked by position.  Column 1 is renamed to "gene" below; the
# later code needs `order`, `Category` and `System` in the merged result, so
# positions 4, 5 and 7 presumably cover those -- TODO confirm against the CSV.
PNIGroups <- PNI[, c(1, 4, 5, 7)]
PNIGroups <- PNIGroups[order(PNIGroups$Gene), ]
colnames(PNIGroups)[1] <- "gene"

# Merge by ordered genes
# (with no `by`, merge() joins on every column name the two tables share)
ProbeList <- merge(PNIGroups, UniqueProbes)

# Order by PNI order
ProbeList <- ProbeList[order(ProbeList$order), ]
ProbeList$order <- NULL
print(dim(ProbeList))

write.csv(ProbeList, row.names = FALSE, file = "ProbeListUnique.csv")

print(table(ProbeList$Category, ProbeList$System))

sink()