# Match CAMDA Microarray data with Affective Disorder Database
# efg, 4 April 2006.  Updated 8 May 2006.
#
# Inputs (read relative to `basepath`):
#   2-GeneSets/a-AffectiveDisorder/AffectiveDisorderGeneList.csv
#     - must contain columns `Gene` and `Group`; the first three columns are
#       used for the final merge and the third is renamed to "gene"
#       (presumably the first two are `Group` and `Seq` -- confirm against
#       the CSV).
#   3-biomaRt/GeneInfo-2006-04-28.csv
#     - must contain columns `gene` and `probe`.
# Outputs (written to the current working directory):
#   1-MatchBiomartWithAffectiveDisorderGenes.txt  (all printed output)
#   ProbeListUnique.csv                           (one row per unique probe)
#
# The trailing "# nnn" comments record the values observed on the dates
# noted; they are not checked by the script.

# Redirect all printed output to a log file until the closing sink().
# NOTE(review): if the script stops with an error, the sink stays open;
# call sink() manually in that case.
sink("1-MatchBiomartWithAffectiveDisorderGenes.txt")

basepath <- "C:/CAMDA06/"

# ---- Load the affective disorder gene list ----
filename <- paste0(basepath, "2-GeneSets/a-AffectiveDisorder/",
                   "AffectiveDisorderGeneList.csv")
AffectiveDisorder <- read.csv(filename, as.is = TRUE)
nrow(AffectiveDisorder)                  # 257
length(unique(AffectiveDisorder$Gene))   # 257

# ---- Load the biomaRt probe/gene annotation ----
filename <- paste0(basepath, "3-biomaRt/", "GeneInfo-2006-04-28.csv")
GeneInfo <- read.csv(filename, as.is = TRUE)
nrow(GeneInfo)                           # 21,448 on 1/30; 21,950 on 4/28

# Probe records without a gene assignment
missing <- GeneInfo[is.na(GeneInfo$gene), ]
nrow(missing)                            # 2,996 on 1/30; 2,948 on 4/28

# Keep only probe records that have a gene assignment
GeneInfo <- GeneInfo[!is.na(GeneInfo$gene), ]
cat("probe records", nrow(GeneInfo), "\n")                  # 18,452 on 1/30; 19,002 on 4/28
cat("unique probes", length(unique(GeneInfo$probe)), "\n")  # 16,526 on 1/30; 16,573 on 4/28

Genes <- unique(GeneInfo$gene)
cat("unique genes", length(Genes), "\n")                    # 13,021 on 1/30; 13,094 on 4/28

# ---- Genes present in both the disorder list and the annotation ----
# `Genes` is already unique and intersect() de-duplicates its result.
Match <- intersect(unique(AffectiveDisorder$Gene), Genes)
cat("Hattori Affective Disorder Gene Match", length(Match), "\n")  # 238 (both dates)

# Counts of matched genes per group, then per integer part of the group
AffectiveDisorderMatch <- AffectiveDisorder[AffectiveDisorder$Gene %in% Match, ]
print(table(AffectiveDisorderMatch$Group))
print(sum(table(AffectiveDisorderMatch$Group)))
print(table(trunc(AffectiveDisorderMatch$Group)))
print(sum(table(trunc(AffectiveDisorderMatch$Group))))

#write.csv(data.frame(Gene=Match), row.names=F,
#          file="GeneList.csv")

cat("Hattori Genes Not Found:\n")
NoMatch <- setdiff(unique(AffectiveDisorder$Gene), Match)
print(length(NoMatch))
print(NoMatch)

# ---- Probe records belonging to matched genes ----
# ProbeList is a logical row mask here; it is reused below for the merged table.
ProbeList <- GeneInfo$gene %in% Match
ProbeInfo <- GeneInfo[ProbeList, ]

# Note the total number of probes is NOT 390 since the replicates
# reported by biomaRt do not correspond to the actual array
dim(ProbeInfo)                           # 390 8
cat("Total Matching Probes", nrow(ProbeInfo), "\n")

# Probe names that occur on more than one record
ProbeCount <- table(ProbeInfo$probe)
Duplicates <- names(ProbeCount[ProbeCount > 1])
length(Duplicates)                       # 17

DuplicateProbes <- ProbeInfo[ProbeInfo$probe %in% Duplicates, 1:7]
DuplicateProbes <- DuplicateProbes[order(DuplicateProbes$probe), ]
dim(DuplicateProbes)                     # 39 7
DuplicateProbes

# Keep the first record for each probe name.  The column is spelled out in
# full (`probe`) rather than relying on `$` partial matching.
UniqueProbes <- ProbeInfo[!duplicated(ProbeInfo$probe), ]
cat("Unique Probe Names", nrow(UniqueProbes), "\n")
dim(UniqueProbes)                        # 368 8

UniqueProbes <- UniqueProbes[order(UniqueProbes$gene), ]

# First three columns of the disorder list, with the third renamed so that
# it shares the key name "gene" with UniqueProbes.
HattoriGroups <- AffectiveDisorder[, 1:3]
HattoriGroups <- HattoriGroups[order(HattoriGroups$Gene), ]
colnames(HattoriGroups)[3] <- "gene"

# Merge by ordered genes.  The key is named explicitly: merge() puts the key
# column first, which the column reorder below (2, 3, 1, ...) depends on.
ProbeList <- merge(HattoriGroups, UniqueProbes, by = "gene")
ProbeList <- ProbeList[, c(2, 3, 1, 4:ncol(ProbeList))]

# Order by Hattori group
ProbeList <- ProbeList[order(ProbeList$Group, ProbeList$Seq), ]
print(dim(ProbeList))
write.csv(ProbeList, row.names = FALSE, file = "ProbeListUnique.csv")

print(table(ProbeList$Group))
print(sum(table(ProbeList$Group)))

# Restore normal console output
sink()