# efg, 6 Feb 2006 #setwd("U:/camda/2006/snp_data/SNP Data") library(RODBC) # available under Linux? connection <- odbcConnectExcel("CAMDA-SNP-Genotype data 4-14-05.xls") #worksheets <- sqlTables(connection)$TABLE_NAME SNP.DATA <- NULL worksheet.set <- c("TPH2", "SLC6A4", "MAOA", "MAOB", "TH", "COMT", "POMC", "CRHR1", "CRHR2", "NR3C1") for (i in 1:length(worksheet.set)) { item <- worksheet.set[i] worksheet <- sqlFetch(connection,item, as.is=TRUE) cat(i, item, ncol(worksheet)-1, "\n") stopifnot(nrow(worksheet) == 223) # make sure all have same number or rows # kludge fixup for "a" and "b" on IDs class(worksheet[,1]) <- "character" worksheet[c(30,31),1] <- c("27369701a", "27369701b") # add gene ID to column names colnames(worksheet)[2:ncol(worksheet)] <- paste(item, ".", colnames(worksheet)[2:ncol(worksheet)], sep="") if (i == 1) { SNP.DATA <- worksheet } else { stopifnot( all(SNP.DATA[,1] == worksheet[,1]) ) # make sure all IDs match SNP.DATA <- cbind(SNP.DATA, worksheet) # Kludge to get rid of duplicate ID columns and avoid problem when ncol(worksheet) is 2 SNP.DATA <- SNP.DATA[,-(ncol(SNP.DATA)-ncol(worksheet)+1)] } print( table( unlist( worksheet[,2:ncol(worksheet)] ) ) ) } colnames(SNP.DATA)[1] <- c("ID") dim(SNP.DATA) close(connection) table( unlist( SNP.DATA[,2:ncol(SNP.DATA)] ) ) SNP.DATA[SNP.DATA == "allele 1"] <- "allele1" SNP.DATA[SNP.DATA == "allele 2"] <- "allele2" table( unlist( SNP.DATA[,2:ncol(SNP.DATA)] ) ) write.csv(SNP.DATA, row.names=FALSE, file="SNPDATA.csv") SNP.DATA[SNP.DATA == "allele1"] <- 1 SNP.DATA[SNP.DATA == "allele2"] <- 2 SNP.DATA[SNP.DATA == "Both"] <- 3 SNP.DATA[SNP.DATA == "Undetermined"] <- 0 write.csv(SNP.DATA, row.names=FALSE, file="SNPNumeric.csv")