# efg, Stowers Institute, 6 May 2006 # # Analyze Chronic Fatigue Syndrome SNP Data for Hardy-Weinberg Equilibrium. # # See http://en.wikipedia.org/wiki/Hardy-Weinberg_principle, especially section # "Significance tests for deviation. Note that "an extra degress of freedom # is lost because expected values were estimated from the observed values" and # there is only one degree of freedom in the chi squared analysis. # Also see http://www.tiem.utk.edu/~gross/bioed/bealsmodules/hardy-weinberg.html library(genetics) # http://www.warnes.net/Research/PresentationFolder/RGeneticsPackage.pdf filename <- "SNPNumeric.csv" SNP <- read.csv(filename, as.is=TRUE) dim(SNP) # For some reason 27369701 was relicated in "a" and "b" variants. # The "b" verson changed an "undetermined" to a single allele, so # "b" will be used instead of "a" for matching below SNP$ID[SNP$ID == "27369701b"] = "27369701" filename <- "Classification.csv" Classes <- read.csv(filename, as.is=TRUE)[,c(1,2,5,7)] dim(Classes) table(Classes$CLUSTER) Combined <- merge(Classes, SNP, by.x="ABTID", by.y="ID") dim(Combined) # cat("Look at dropout:") sort(SNP$ID[!SNP$ID %in% Combined$ABTID]) sort(Classes$ABTID[!Classes$ABTID %in% Combined$ABTID]) Basis <- Combined[Combined$CLUSTER %in% c("Least", "Middle", "Worst"),] rownames(Basis) <- Basis[,1] Basis <- Basis[,-1] HardyWeinberg <- function(title, SelectionList) { cat("\n================================================================================================\n") Nmale <- sum(Basis$sex[SelectionList] == "Male") Nfemale <- sum(Basis$sex[SelectionList] == "Female") cat(title, "-- Males = ", Nmale, ", Females=", Nfemale, "\n") cat("\nHardy-Weinberg Equilibrium Analysis\n") cat( " # SNP N N1 N2 N3 p q E1 E2 E3 chi sq HWE.chisq HWE.exact\n") cat( "-- -------------------- --- --- --- --- ------ ------ --- --- --- ------ ------------- ---------\n") for (j in 4:ncol(Basis)) { alleles <- Basis[SelectionList,j] N1 <- sum(alleles == 1) N2 <- sum(alleles == 2) N3 <- sum(alleles == 3) N <- N1 + N2 + N3 p <- (2*N1 + N3) / (2 * N) q <- 1 - p E1 <- p^2 * N E2 <- q^2 * N E3 <- 2 * p * q * N chi.squared = (N1-E1)^2/E1 + (N2-E2)^2/E2 + (N3-E3)^2/E3 # Use Bioconductor genetics package for chi squared and "exact" test g <- genotype( c("1/1","2/2","1/2")[ Basis[SelectionList,j] ] ) test.chisq <- HWE.chisq(g) marker1 <- ifelse(test.chisq$p.value < 0.05, "*", " ") test.exact <- HWE.exact(g) marker2 <- ifelse(test.exact$p.value < 0.05, "*", " ") cat( sprintf("%2d %-20s %3d %3d %3d %3d %6.3f %6.3f %3.0f %3.0f %3.0f %6.3f %6.3f %6.3f%s %6.3f%s\n", j-3, colnames(Basis)[j], N, N1, N2, N3, p, q, E1, E2, E3, chi.squared, test.chisq$statistic, test.chisq$p.value, marker1, test.exact$p.value, marker2) ) } cat("\n*\n Null Hypothesis that Observed Values Match Hardy-Weinberg Equilibrium REJECTED at 0.05 level\n") } HardyWeinbergAnalysis <- function() { # Overview HardyWeinberg("All Patients", 1:nrow(Basis)) HardyWeinberg("Female Patients", Basis$sex == "Female") HardyWeinberg("Male Patients", Basis$sex == "Male") # Worst HardyWeinberg("Worst Patients", Basis$CLUSTER == "Worst") HardyWeinberg("Worst Female Patients", Basis$CLUSTER == "Worst" & Basis$sex == "Female") # Sick HardyWeinberg("Sick Patients (Worst + Middle)", Basis$CLUSTER %in% c("Middle", "Worst")) HardyWeinberg("Sick Female Patients (Worst + Middle)", Basis$CLUSTER %in% c("Middle", "Worst") & Basis$sex == "Female") HardyWeinberg("Sick Male Patients (Worst + Middle)", Basis$CLUSTER %in% c("Middle", "Worst") & Basis$sex == "Male") # Onset: Gradual Vs Sudden SelectList <- Basis$CLUSTER %in% c("Middle", "Worst") & Basis$Onset == "Sudden" SelectList[is.na(SelectList)] <- FALSE HardyWeinberg("Sick + Sudden Onset", SelectList) SelectList <- Basis$CLUSTER %in% c("Middle", "Worst") & Basis$Onset == "Gradual" SelectList[is.na(SelectList)] <- FALSE HardyWeinberg("Sick + Gradual Onset", SelectList) SelectList <- Basis$CLUSTER %in% c("Middle", "Worst") & Basis$Onset == "Sudden" & Basis$sex == "Female" SelectList[is.na(SelectList)] <- FALSE HardyWeinberg("Sick Females + Sudden Onset", SelectList) SelectList <- Basis$CLUSTER %in% c("Middle", "Worst") & Basis$Onset == "Gradual" & Basis$sex == "Female" SelectList[is.na(SelectList)] <- FALSE HardyWeinberg("Sick Females + Gradual Onset", SelectList) # Least Group HardyWeinberg("Least Group", Basis$CLUSTER == "Least") HardyWeinberg("Least Female Group", Basis$CLUSTER == "Least" & Basis$sex == "Female") HardyWeinberg("Least Male Group", Basis$CLUSTER == "Least" & Basis$sex == "Male") } ################################################## sink("HardyWeinbergAnalysis.txt") print( table(Basis$sex) ) print( table(Basis$CLUSTER) ) print( table(Basis$CLUSTER, Basis$sex) ) print( table(Basis$CLUSTER, Basis$Onset) ) print( table(Basis$sex, Basis$Onset) ) HardyWeinbergAnalysis() sink()