# efg, 8 May 2006. Updated 1 June 2006. # Stowers Institute for Medical Research # # Use 25,000 simulated annealing iterations with 500 baggs. library(logicFS) # Setup for Logic Regression Analysis filename <- "SNPNumeric.csv" SNP <- read.csv(filename, as.is=TRUE) dim(SNP) # 223 43 # For some reason 27369701 was relicated in "a" and "b" variants. # The "b" verson changed an "undetermined" to a single allele, so # "b" will be used instead of "a" for matching below SNP$ID[SNP$ID == "27369701b"] = "27369701" filename <- "Classification.csv" Classes <- read.csv(filename, as.is=TRUE)[,c(1,2,5,7)] dim(Classes) # 227 4 table(Classes$CLUSTER) # Excluded Least Middle Worst # 63 67 67 30 Combined <- merge(Classes, SNP, by.x="ABTID", by.y="ID") dim(Combined) # 216 46 # cat("Look at dropout:") sort(SNP$ID[!SNP$ID %in% Combined$ABTID]) #[1] "22149604" "22340204" "23366103" "24120805" "24296203" "27369701a" "28293201" sort(Classes$ABTID[!Classes$ABTID %in% Combined$ABTID]) # [1] 10372601 20086501 21193404 21991602 22696801 23229701 24478902 24684401 25058802 26834601 28613202 Basis <- Combined[Combined$CLUSTER %in% c("Least", "Middle", "Worst"),] rownames(Basis) <- Basis[,1] Basis <- Basis[,-1] dim(Basis) # 160 45 # Break into "info" and "SNP" data BasisInfo <- Basis[,1:3] BasisSNP <- as.matrix(Basis[,4:ncol(Basis)]) dim(BasisInfo) # 160 3 dim(BasisSNP) # 160 42 table(BasisInfo$CLUSTER) # Least Middle Worst # 64 66 30 table(BasisInfo$Onset, BasisInfo$sex) # Female Male # 41 7 # Gradual 76 24 # Sudden 8 2 CompareTwoLevels <- function(my.anneal, Classifier, SNPdata, title, seed=19, TreeCount=500, LeafCount=40) { cat("****************************************************\n") title <- paste(title, "[", seed, ", ", my.anneal$start, ",", my.anneal$end, ",", my.anneal$iter, "]", sep="") cat(paste(title, "\n")) print( dim(SNPdata) ) # Exclude any row with a missing value RowSelect <- apply(SNPdata == 0, ROW<-1, sum) == 0 ExcludeRowCount <- nrow(SNPdata) - sum(RowSelect) print(paste("Missing value rows excluded = ", ExcludeRowCount, " (", round(100*ExcludeRowCount/nrow(SNPdata),1), "%)", sep="")) ColumnSelect <- apply(SNPdata == 0, COLUMN<-2, sum) == 0 ExcludeColumnCount <- ncol(SNPdata) - sum(ColumnSelect) print(paste("Missing value columns would have been = ",ExcludeColumnCount, " (", round(100*ExcludeColumnCount/ncol(SNPdata),1), "%)", sep="")) SNPdata <- SNPdata[RowSelect,] Classifier <- Classifier[RowSelect] print(table(Classifier)) # Recoded for consistency between CAMDA data and logicFs definitions # This is a bit of a kludge. # CAMDA Allele 1 -> 1 # CAMDA Allele 2 -> 3 # CAMDA Allele 1+2 = 3 -> 2 xTemp <- c(1,3,2 )[ SNPdata ] dim(xTemp) <- dim(SNPdata) colnames(xTemp) <- colnames(SNPdata) rownames(xTemp) <- rownames(SNPdata) SNPdataFixed <- xTemp bin.snps <- make.snp.dummy(SNPdataFixed) print( dim(bin.snps) ) bag.out <- logic.bagging(bin.snps, Classifier, B=TreeCount, nleaves=LeafCount, rand=seed, anneal.control=my.anneal) print(bag.out); print(bag.out$vim, topX=20) plot(bag.out, coded=TRUE); mtext(title, NORTH<-3, outer=TRUE, line=1) prediction <- predict(bag.out, bin.snps) cat(paste("Misclassification rate = ", sprintf("%.2f", 100*mean(prediction != Classifier)), "%\n", sep="")) } StartTime <- proc.time() # Run Parameters seed <- 19937 AnnealStart <- 2 AnnealEnd <- -2 AnnealIterations <- 25000 # Ruczinski in LogicReg package: use 25000 iterations or more; # use of 2500 is only to have the examples run fast my.anneal <- logreg.anneal.control(start=AnnealStart, end=AnnealEnd, iter=AnnealIterations) pdf(paste("SNP-BaggedLogicRegression-Run1-", seed, ".pdf", sep=""), width=10, height=7.5) oldpar <- par( oma=c(1,0,2,0) ) sink(paste("SNP-BaggedLogicRegression-Run1-", seed, ".txt", sep="")) #Example setup for debugging #TargetSet <- c("Least", "Worst") #SelectedRows <- BasisInfo$CLUSTER %in% TargetSet #Classifier <- ifelse(BasisInfo$CLUSTER[SelectedRows] == "Least", 0, 1) #SNPdata <- BasisSNP[SelectedRows,] #title <- paste("Worst vs Least (", seed, ")", sep="") #CompareTwoLevels(my.anneal, Classifier, SNPdata, title, seed) SelectedRows <- BasisInfo$CLUSTER %in% c("Least", "Worst") Exclude.MAO.SNPs <- substr(colnames(BasisSNP),1,3) != "MAO" CompareTwoLevels(my.anneal, ifelse(BasisInfo$CLUSTER[SelectedRows] == "Least", 0, 1), # Classifier BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Worst vs Least", seed) SelectedRows <- (BasisInfo$CLUSTER %in% c("Least", "Worst")) & (BasisInfo$sex == "Female") Exclude.MAO.SNPs <- substr(colnames(BasisSNP),1,3) != "MAO" CompareTwoLevels(my.anneal, ifelse(BasisInfo$CLUSTER[SelectedRows] == "Least", 0, 1), # Classifier BasisSNP[SelectedRows,], # Include MAOA and MAOB "Worst vs Least (Females Only)", seed) SelectedRows <- BasisInfo$CLUSTER %in% c("Least", "Middle") CompareTwoLevels(my.anneal, ifelse(BasisInfo$CLUSTER[SelectedRows] == "Least", 0, 1), BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Middle vs Least", seed) SelectedRows <- BasisInfo$CLUSTER %in% c("Middle", "Worst") CompareTwoLevels(my.anneal, ifelse(BasisInfo$CLUSTER[SelectedRows] == "Middle", 0, 1), BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Worst vs Middle", seed) SelectedRows <- BasisInfo$CLUSTER %in% c("Least", "Middle", "Worst") CompareTwoLevels(my.anneal, ifelse(BasisInfo$CLUSTER[SelectedRows] == "Least", 0, 1), BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Sick (Worst + Middle) vs Least", seed) # Sick: Female Vs Maile # (two few for "Worst" Female vs Male) SelectedRows <- BasisInfo$CLUSTER %in% c("Worst", "Middle") CompareTwoLevels(my.anneal, ifelse(BasisInfo$sex[SelectedRows] == "Female", 0, 1), BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Sick: Female vs Male", seed) #Random SelectedRows <- BasisInfo$CLUSTER %in% c("Least", "Middle", "Worst") CompareTwoLevels(my.anneal, round(runif(length(SelectedRows)) ), BasisSNP[SelectedRows,Exclude.MAO.SNPs], "Random Classification", seed) par(oldpar) dev.off() cat((proc.time() - StartTime)[1:3], "\n") sink()