[adegenet-commits] r742 - in pkg: . R src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Dec 22 17:12:52 CET 2010
Author: jombart
Date: 2010-12-22 17:12:52 +0100 (Wed, 22 Dec 2010)
New Revision: 742
Added:
pkg/src/SNPbin.c
Modified:
pkg/DESCRIPTION
pkg/R/SNPbin.R
Log:
Begining of the SNPbin class. Bit-storage and C conversion work.
Bloody fast and efficient.
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2010-12-14 15:30:52 UTC (rev 741)
+++ pkg/DESCRIPTION 2010-12-22 16:12:52 UTC (rev 742)
@@ -10,6 +10,6 @@
Suggests: ade4, genetics, hierfstat, spdep, tripack, ape, pegas, graph, RBGL, seqinr
Depends: methods, MASS
Description: Classes and functions for genetic data analysis within the multivariate framework.
-Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R zzz.R
+Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R SNPbin.R zzz.R
License: GPL (>=2)
LazyLoad: yes
Modified: pkg/R/SNPbin.R
===================================================================
--- pkg/R/SNPbin.R 2010-12-14 15:30:52 UTC (rev 741)
+++ pkg/R/SNPbin.R 2010-12-22 16:12:52 UTC (rev 742)
@@ -1,18 +1,178 @@
+##
+## NOTE :
+## it is possible to recode each set of 8 binary values on a basis like 'basbin'
+## each genotype of 8 snp is then uniquely mapped to a number on 0:255
+## basbin <- 2^(0:7)
+## all(sapply(apply(SNPCOMB, 1, function(e) basbin[as.logical(e)]), sum) == 0:255)
+##
+##
+##
+
+
+
+####################
+## SNPbin class
+####################
+setClass("SNPbin", representation(snp = "raw",
+ n.loc = "integer",
+ NA.posi = "integer",
+ label = "charOrNULL"),
+ prototype(snp = raw(0), n.loc = integer(0), label = NULL))
+
+
+
+
+
+####################
+## SNPbin constructor
+####################
+setMethod("initialize", "SNPbin", function(.Object, ...) {
+ x <- .Object
+ input <- list(...)
+ if(length(input)==1) names(input) <- "snp"
+
+
+ ## handle snp data ##
+ if(!is.null(input$snp)){
+ if(is.raw(input$snp)){
+ x at snp <-input$snp
+ }
+
+ ## conversion from a vector of 0/1 (integers)
+ if(is.numeric(input$snp) | is.integer(input$snp)){
+ temp <- na.omit(unique(input$snp))
+ if(!all(temp %in% c(0,1))){
+ stop("Values of SNPs are not all 0, 1, or NA")
+ }
+
+ temp <- .bin2raw(input$snp)
+ x at snp <- temp$snp
+ x at n.loc <- temp$n.loc
+ x at NA.posi <- temp$NA.posi
+ return(x)
+ }
+ }
+
+
+ ## handle n.loc ##
+ if(!is.null(input$n.loc)){
+ x at n.loc <- as.integer(input$n.loc)
+ } else {
+ x at n.loc <- as.integer(length(x at snp)*8)
+ }
+
+
+ ## handle NA.posi ##
+ if(!is.null(input$NA.posi)){
+ x at NA.posi <- as.integer(input$NA.posi)
+ }
+
+ return(x)
+}) # end SNPbin constructor
+
+
+
## each byte takes a value on [0,255]
## function to code multiple SNPs on a byte
-## 7 combinations of SNPs can be coded on a single byte
+## 8 combinations of SNPs can be coded on a single byte
## we use bytes values from [1,128]
## 200 is a missing value
-SNPCOMB <- expand.grid(rep(list(c(0,1)), 7))
+.bin2raw <- function(vecSnp){
+ ## was required in pure-R version
+ ## SNPCOMB <- as.matrix(expand.grid(rep(list(c(0,1)), 8)))
+ ## colnames(SNPCOMB) <- NULL
+ ## handle missing data
+ NAposi <- which(is.na(vecSnp))
+ if(length(NAposi)>0){
+ vecSnp[is.na(vecSnp)] <- 0L
+ }
-f1 <- function(vecSnp){
- nbBytes <- 1+ length(vecSnp) %/% 7
- out.length <- 7*nbBytes
- temp <- c(vecSnp, rep(0, out.length-length(vecSnp))) # fill the end with 0 of necessary
- sapply(seq(1, by=7, length=nbBytes), function(i) which(apply(SNPCOMB,1, function(e) all(vecSnp[i:(i+7)]==e))) )
+ nbBytes <- 1+ length(vecSnp) %/% 8
+ ori.length <- length(vecSnp)
+ new.length <- 8*nbBytes
+ vecSnp <- c(vecSnp, rep(0, new.length-ori.length)) # fill the end with 0 of necessary
- as.raw(length(vecSnp))
-}
+
+ ## map info to bytes (0:255)
+ vecSnp <- as.integer(vecSnp)
+ vecRaw <- integer(nbBytes)
+
+ vecRaw <- .C("binIntToBytes", vecSnp, length(vecSnp), vecRaw, nbBytes, PACKAGE="adegenet")[[3]]
+ ## vecraw <- sapply(seq(1, by=8, length=nbBytes), function(i) which(apply(SNPCOMB,1, function(e) all(temp[i:(i+7)]==e))) ) # old R version
+
+ ## code information as raw and add missing data
+ vecRaw <- as.raw(vecRaw)
+ res <- list(snp=vecRaw, n.loc=as.integer(ori.length), NA.posi=as.integer(NAposi))
+ return(res)
+} # end .bin2raw
+
+
+
+
+
+
+
+#############
+## .SNPbin2int
+#############
+## convert SNPbin to integers (0/1)
+.SNPbin2int <- function(x){
+ SNPCOMB <- as.matrix(expand.grid(rep(list(c(0,1)), 8)))
+ colnames(SNPCOMB) <- NULL
+ res <- unlist(lapply(as.integer(x at snp), function(i) SNPCOMB[i+1,]))
+ res <- res[1:x at n.loc]
+ if(length(x at NA.posi)>0){
+ res[x at NA.posi] <- NA
+ }
+ return(res)
+} # end .SNPbin2int
+
+
+
+
+
+
+#############
+## as methods
+#############
+setAs("SNPbin", "integer", def=function(from){
+ res <- .SNPbin2int(from)
+ return(res)
+})
+
+
+
+
+
+
+##############
+## show method
+##############
+setMethod ("show", "SNPbin", function(object){
+
+ cat("\n", )
+
+ cat("\n at call: ")
+ print(x at call)
+}) # end show method
+
+
+
+
+
+
+
+
+
+################################
+## testing :
+##
+## dat <- sample(c(0,1,NA), 1e6, prob=c(.5, .495, .005), replace=TRUE)
+## x <- new("SNPbin", dat)
+
+## identical(as(x, "integer"),dat) # MUST BE TRUE
+
+## object.size(dat)/object.size(x) # EFFICIENCY OF CONVERSION
Added: pkg/src/SNPbin.c
===================================================================
--- pkg/src/SNPbin.c (rev 0)
+++ pkg/src/SNPbin.c 2010-12-22 16:12:52 UTC (rev 742)
@@ -0,0 +1,101 @@
+/*
+ Coded by Thibaut Jombart (tjombart at imperial.ac.uk), December 2010.
+ Distributed with the adephylo package for the R software.
+ Licence: GPL >=2.
+
+ These functions are designed to recode genotypes given as binary integers into new integers which
+ map them to unique bytes. One genotype of 8 binary SNPs is mapped uniquely (bijectively) to a
+ value between 0 and 255. This is achieved by considering the genotype 'x' in the basis 2^0
+ ... 2^7, and summing the values of the vector in this basis. That is, we use the function:
+
+ {0,1}^8 |-> {0,...,255}
+ x -> x_1 * 2^0 + ... + x_8 * 2^7 = \sum_i x_i * 2^(i-1)
+
+*/
+
+
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include <stdlib.h>
+#include <R.h>
+#include <R_ext/Utils.h>
+#include "adesub.h"
+
+
+/*
+ ===============================
+ === MAIN EXTERNAL FUNCTIONS ===
+ ===============================
+*/
+
+
+/*
+ === MAP BINARY SNPS TO 1->256 SCALE ===
+ - vecsnp: vector of integers (0/1)
+ - vesize: length of vecsnp
+ - res: vector of integers valued on 0:255
+ - ressize: length of res
+*/
+void binIntToBytes(int *vecsnp, int *vecsize, int *vecres, int *ressize){
+ /* declarations */
+ int i, j, idres, *binBasis; /* must use dynamic allocation */
+
+ /* allocate memory for local variables */
+ vecintalloc(&binBasis, 8);
+
+ /* define binary basis */
+ for(i=1; i<=8; i++){
+ binBasis[i] = pow(2, i-1);
+ }
+
+ /* set all values of vecres to 0 */
+ for(i=0;i < *ressize;i++){
+ vecres[i] = 0;
+ }
+
+
+
+ /*
+ =============
+ MAIN FUNCTION
+ =============
+ */
+
+ /* INDICES */
+ /* i: idx of snp */
+ /* j: idx of binBasis (1:8) */
+ /* idres: idx in vector of results */
+
+ idres = 0;
+ j = 1;
+ for(i=0;i< *vecsize;i++){
+ vecres[idres] = vecres[idres] + binBasis[j] * vecsnp[i];
+ if(j == 8){
+ idres = idres +1;
+ j = 1;
+ } else {
+ j = j+1;
+ }
+ }
+
+
+ /* free memory */
+ freeintvec(binBasis);
+
+} /* end sptips */
+
+
+
+
+
+
+
+
+
+
+/* TESTING */
+/*
+
+
+*/
More information about the adegenet-commits
mailing list