# efg, Stowers Institute, 5 Oct 2005.  Updated 11 Jan 2006
#
# Inventory script for the CAMDA 2006 data set.  It
#   1. reads the blood profile and illness classification sheets from two
#      Excel workbooks and merges them into `clinical`,
#   2. lists the .txt and .tif files in the gene expression directory and
#      takes the first 8 characters of each file name as its identifier,
#   3. prints counts and set differences between the clinical ABTIDs and the
#      gene expression identifiers, and
#   4. writes the ABTID/CLUSTER pairs that have a non-missing CLUSTER to
#      NoExclusions.csv in the working directory.

setwd("U:/camda/2006/clinical_data")

library(RODBC)   # available under Linux?
                 # NOTE(review): the RODBC documentation lists
                 # odbcConnectExcel as Windows-only -- confirm before porting.

# Run one SQL query against an Excel workbook and return the result.
#
# workbook: path of the .xls file, relative to the working directory.
# query:    SQL text passed to sqlQuery().
#
# Prints the workbook's table list before querying.  The ODBC channel is
# closed on exit, including when the query fails.  Stops with an error if
# the workbook cannot be opened or the query does not yield a data frame.
read_workbook_query <- function(workbook, query) {
  connection <- odbcConnectExcel(workbook)
  if (!inherits(connection, "RODBC")) {
    stop("Cannot open Excel workbook: ", workbook, call. = FALSE)
  }
  on.exit(close(connection), add = TRUE)

  print(sqlTables(connection))

  # as.is = TRUE keeps the columns as delivered by the driver instead of
  # letting R convert them.
  result <- sqlQuery(connection, query, as.is = TRUE)
  if (!is.data.frame(result)) {
    stop("Query failed on ", workbook, ": ",
         paste(result, collapse = "\n"), call. = FALSE)
  }
  result
}

# Blood profile ----

# Alternative: d <- sqlFetch(connection, "Blood Profile$")
blood <- read_workbook_query("Complete Blood Evaluation.xls",
                             "select * from [Blood Profile$]")

print(dim(blood))
print(names(blood))
print(length(blood$ABTID))
print(length(unique(blood$ABTID)))

blood.dates <- blood[["Collection Date Blood"]]
print(blood.dates[order(blood.dates)])
# hist( as.Date(blood$"Collection Date Blood", "%m/%d/%y") , breaks=8)

# Illness classification ----

# note misspelling in filename of "classification"
# For some reason blank lines are read if condition ABTID <> NULL is absent.
# NOTE(review): standard SQL spells this test "ABTID IS NOT NULL"; the query
# text is left exactly as the author had it -- confirm against the driver
# before changing it.
illness <- read_workbook_query(
  "Illness Classificatoin SF36 MFI and Symptoms.xls",
  "select * from [Class Demo$] where ABTID <> NULL"
)

print(dim(illness))
print(names(illness))
print(length(unique(illness$ABTID)))

# Clinical ----

# No `by` is given, so merge() joins on every column name the two sheets share.
clinical <- merge(blood, illness)
print(dim(clinical))
print(names(clinical))

cat("Clinical\n")
FullList <- clinical$ABTID
print(length(FullList))

# A missing collection date counts as "no blood" just like an empty string.
# Comparing NA with "" gives NA, and an NA subscript would put NA into
# NoBlood instead of the patient's ABTID, so NA is tested explicitly.
collection.date <- clinical[["Collection Date Blood"]]
NoBlood <- clinical$ABTID[is.na(collection.date) | collection.date == ""]
print(NoBlood)
print(length(NoBlood))

WithBlood <- setdiff(FullList, NoBlood)
print(length(WithBlood))
print(length(unique(WithBlood)))

# Gene Expression ----

cat("Gene Expression\n")
expression.dir <- "U:/camda/2006/gene_expression_data/Gene Expression"

# The patterns are regular expressions: escape the dot and anchor at the end
# so only a real ".txt" / ".tif" extension matches and only it is stripped.
txt.list <- list.files(path = expression.dir, pattern = "\\.txt$")
txt.list <- sub("\\.txt$", "", txt.list)
cat("txt", length(txt.list), "\n")
txt.ids <- substr(txt.list, 1, 8)   # first 8 characters identify the sample
txt.unique <- unique(txt.ids)
print(length(txt.unique))

tif.list <- list.files(path = expression.dir, pattern = "\\.tif$")
tif.list <- sub("\\.tif$", "", tif.list)
cat("tif", length(tif.list), "\n")
tif.ids <- substr(tif.list, 1, 8)
tif.unique <- unique(tif.ids)
print(length(tif.unique))

cat(" txt files without tifs\n")
TxtNoTif <- setdiff(txt.ids, tif.ids)
print(TxtNoTif)

# Identifiers on disk that are not among the ABTIDs with a blood date.
GeneExpressionNoClinical <- setdiff(txt.unique, WithBlood)
print(GeneExpressionNoClinical)
print(setdiff(tif.unique, WithBlood))

cat(" txt duplicates\n")
CountExpression <- table(txt.ids)
replicates <- names(CountExpression[CountExpression > 1])
print(replicates)

print("Gene Expression Replicates")
print(txt.list[txt.ids %in% replicates])

print(length(intersect(WithBlood, txt.unique)))

print(intersect(FullList, GeneExpressionNoClinical))

# No exclusions ----

# Use Reeves 2005 paper to define set of 164 patients without any exclusions
NoExclusions <- clinical[!is.na(clinical$CLUSTER), c("ABTID", "CLUSTER")]

# Written once, without row names (an earlier write that included row names
# was immediately overwritten by this one).
write.csv(NoExclusions, row.names = FALSE, file = "NoExclusions.csv")