[adegenet-commits] r742 - in pkg: . R src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Wed Dec 22 17:12:52 CET 2010


Author: jombart
Date: 2010-12-22 17:12:52 +0100 (Wed, 22 Dec 2010)
New Revision: 742

Added:
   pkg/src/SNPbin.c
Modified:
   pkg/DESCRIPTION
   pkg/R/SNPbin.R
Log:
Begining of the SNPbin class. Bit-storage and C conversion work.
Bloody fast and efficient.


Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION	2010-12-14 15:30:52 UTC (rev 741)
+++ pkg/DESCRIPTION	2010-12-22 16:12:52 UTC (rev 742)
@@ -10,6 +10,6 @@
 Suggests: ade4, genetics, hierfstat, spdep, tripack, ape, pegas, graph, RBGL, seqinr
 Depends: methods, MASS
 Description: Classes and functions for genetic data analysis within the multivariate framework.
-Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R  colorplot.R  monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R  inbreeding.R zzz.R
+Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R  colorplot.R  monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R  inbreeding.R SNPbin.R zzz.R
 License: GPL (>=2)
 LazyLoad: yes

Modified: pkg/R/SNPbin.R
===================================================================
--- pkg/R/SNPbin.R	2010-12-14 15:30:52 UTC (rev 741)
+++ pkg/R/SNPbin.R	2010-12-22 16:12:52 UTC (rev 742)
@@ -1,18 +1,178 @@
+##
+## NOTE :
+## it is possible to recode each set of 8 binary values on a basis like 'basbin'
+## each genotype of 8 snp is then uniquely mapped to a number on 0:255
+## basbin <- 2^(0:7)
+## all(sapply(apply(SNPCOMB, 1, function(e) basbin[as.logical(e)]), sum) == 0:255)
+##
+##
+##
+
+
+
+####################
+## SNPbin class
+####################
+setClass("SNPbin", representation(snp = "raw",
+                                  n.loc = "integer",
+                                  NA.posi = "integer",
+                                  label = "charOrNULL"),
+         prototype(snp = raw(0), n.loc = integer(0), label = NULL))
+
+
+
+
+
+####################
+## SNPbin constructor
+####################
+setMethod("initialize", "SNPbin", function(.Object, ...) {
+    x <- .Object
+    input <- list(...)
+    if(length(input)==1) names(input) <- "snp"
+
+
+    ## handle snp data ##
+    if(!is.null(input$snp)){
+        if(is.raw(input$snp)){
+            x at snp <-input$snp
+        }
+
+        ## conversion from a vector of 0/1 (integers)
+        if(is.numeric(input$snp) | is.integer(input$snp)){
+            temp <- na.omit(unique(input$snp))
+            if(!all(temp %in% c(0,1))){
+                stop("Values of SNPs are not all 0, 1, or NA")
+            }
+
+            temp <- .bin2raw(input$snp)
+            x at snp <- temp$snp
+            x at n.loc <- temp$n.loc
+            x at NA.posi <- temp$NA.posi
+            return(x)
+        }
+    }
+
+
+    ## handle n.loc ##
+    if(!is.null(input$n.loc)){
+        x at n.loc <- as.integer(input$n.loc)
+    } else {
+        x at n.loc <- as.integer(length(x at snp)*8)
+    }
+
+
+    ## handle NA.posi ##
+    if(!is.null(input$NA.posi)){
+        x at NA.posi <- as.integer(input$NA.posi)
+    }
+
+    return(x)
+}) # end SNPbin constructor
+
+
+
 ## each byte takes a value on [0,255]
 
 ## function to code multiple SNPs on a byte
-## 7 combinations of SNPs can be coded on a single byte
+## 8 combinations of SNPs can be coded on a single byte
 ## we use bytes values from [1,128]
 ## 200 is a missing value
-SNPCOMB <- expand.grid(rep(list(c(0,1)), 7))
+.bin2raw <- function(vecSnp){
+    ## was required in pure-R version
+    ## SNPCOMB <- as.matrix(expand.grid(rep(list(c(0,1)), 8)))
+    ## colnames(SNPCOMB) <- NULL
 
+    ## handle missing data
+    NAposi <- which(is.na(vecSnp))
+    if(length(NAposi)>0){
+        vecSnp[is.na(vecSnp)] <- 0L
+    }
 
-f1 <- function(vecSnp){
-    nbBytes <- 1+ length(vecSnp) %/% 7
-    out.length <- 7*nbBytes
-    temp <- c(vecSnp, rep(0, out.length-length(vecSnp))) # fill the end with 0 of necessary
-    sapply(seq(1, by=7, length=nbBytes), function(i) which(apply(SNPCOMB,1, function(e) all(vecSnp[i:(i+7)]==e))) )
 
+    nbBytes <- 1+ length(vecSnp) %/% 8
+    ori.length <- length(vecSnp)
+    new.length <- 8*nbBytes
+    vecSnp <- c(vecSnp, rep(0, new.length-ori.length)) # fill the end with 0 of necessary
 
-    as.raw(length(vecSnp))
-}
+
+    ## map info to bytes (0:255)
+    vecSnp <- as.integer(vecSnp)
+    vecRaw <- integer(nbBytes)
+
+    vecRaw <- .C("binIntToBytes", vecSnp, length(vecSnp), vecRaw, nbBytes, PACKAGE="adegenet")[[3]]
+    ## vecraw <- sapply(seq(1, by=8, length=nbBytes), function(i) which(apply(SNPCOMB,1, function(e) all(temp[i:(i+7)]==e))) ) # old R version
+
+    ## code information as raw and add missing data
+    vecRaw <- as.raw(vecRaw)
+    res <- list(snp=vecRaw, n.loc=as.integer(ori.length), NA.posi=as.integer(NAposi))
+    return(res)
+} # end .bin2raw
+
+
+
+
+
+
+
+#############
+## .SNPbin2int
+#############
+## convert SNPbin to integers (0/1)
+.SNPbin2int <- function(x){
+    SNPCOMB <- as.matrix(expand.grid(rep(list(c(0,1)), 8)))
+    colnames(SNPCOMB) <- NULL
+    res <- unlist(lapply(as.integer(x at snp), function(i) SNPCOMB[i+1,]))
+    res <- res[1:x at n.loc]
+    if(length(x at NA.posi)>0){
+        res[x at NA.posi] <- NA
+    }
+    return(res)
+} # end .SNPbin2int
+
+
+
+
+
+
+#############
+## as methods
+#############
+setAs("SNPbin", "integer", def=function(from){
+    res <- .SNPbin2int(from)
+    return(res)
+})
+
+
+
+
+
+
+##############
+## show method
+##############
+setMethod ("show", "SNPbin", function(object){
+
+    cat("\n", )
+
+  cat("\n at call: ")
+  print(x at call)
+}) # end show method
+
+
+
+
+
+
+
+
+
+################################
+## testing :
+##
+## dat <- sample(c(0,1,NA), 1e6, prob=c(.5, .495, .005), replace=TRUE)
+## x <- new("SNPbin", dat)
+
+## identical(as(x, "integer"),dat) # MUST BE TRUE
+
+## object.size(dat)/object.size(x) # EFFICIENCY OF CONVERSION

Added: pkg/src/SNPbin.c
===================================================================
--- pkg/src/SNPbin.c	                        (rev 0)
+++ pkg/src/SNPbin.c	2010-12-22 16:12:52 UTC (rev 742)
@@ -0,0 +1,101 @@
+/*
+  Coded by Thibaut Jombart (tjombart at imperial.ac.uk), December 2010.
+  Distributed with the adephylo package for the R software.
+  Licence: GPL >=2.
+
+  These functions are designed to recode genotypes given as binary integers into new integers which
+  map them to unique bytes. One genotype of 8 binary SNPs is mapped uniquely (bijectively) to a
+  value between 0 and 255. This is achieved by considering the genotype 'x' in the basis 2^0
+  ... 2^7, and summing the values of the vector in this basis. That is, we use the function:
+
+  {0,1}^8 |-> {0,...,255}
+  x -> x_1 * 2^0 + ... + x_8 * 2^7 = \sum_i x_i * 2^(i-1)
+
+*/
+
+
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include <stdlib.h>
+#include <R.h>
+#include <R_ext/Utils.h>
+#include "adesub.h"
+
+
+/* 
+   ===============================
+   === MAIN EXTERNAL FUNCTIONS ===
+   ===============================
+*/
+
+
+/* 
+   === MAP BINARY SNPS TO 1->256 SCALE ===
+   - vecsnp: vector of integers (0/1)
+   - vesize: length of vecsnp
+   - res: vector of integers valued on 0:255
+   - ressize: length of res
+*/
+void binIntToBytes(int *vecsnp, int *vecsize, int *vecres, int *ressize){
+	/* declarations */
+	int i, j, idres, *binBasis; /* must use dynamic allocation */
+
+	/* allocate memory for local variables */
+	vecintalloc(&binBasis, 8);
+
+	/* define binary basis */
+	for(i=1; i<=8; i++){
+		binBasis[i] = pow(2, i-1);
+	}
+
+	/* set all values of vecres to 0 */
+	for(i=0;i < *ressize;i++){
+		vecres[i] = 0;
+	}
+
+
+
+	/* 
+	   =============
+	   MAIN FUNCTION
+	   ============= 
+	*/
+	
+	/* INDICES */
+	/* i: idx of snp */
+	/* j: idx of binBasis (1:8) */
+	/* idres: idx in vector of results */
+
+	idres = 0;
+	j = 1;
+	for(i=0;i< *vecsize;i++){
+		vecres[idres] = vecres[idres] + binBasis[j] * vecsnp[i];
+		if(j == 8){
+			idres = idres +1;
+			j = 1;
+		} else {
+			j = j+1;
+		}
+	}
+	
+	
+	/* free memory */
+	freeintvec(binBasis);
+
+} /* end sptips */
+
+
+
+
+
+
+
+
+
+
+/* TESTING */
+/*
+
+
+*/



More information about the adegenet-commits mailing list