[adegenet-commits] r135 - in pkg: . R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Jun 26 18:02:13 CEST 2008
Author: jombart
Date: 2008-06-26 18:02:13 +0200 (Thu, 26 Jun 2008)
New Revision: 135
Modified:
pkg/DESCRIPTION
pkg/R/import.R
pkg/man/propTyped.Rd
Log:
Important modifications concerning df2genind, to adapt to various levels of ploidy. Check blocked so that users cannot try it yet.
Seems to work, needs testing.
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2008-06-26 12:17:15 UTC (rev 134)
+++ pkg/DESCRIPTION 2008-06-26 16:02:13 UTC (rev 135)
@@ -9,4 +9,4 @@
Description: Classes and functions for genetic data analysis within the multivariate framework.
License: GPL (>=2)
LazyLoad: yes
-Collate: classes.R auxil.R genind2genpop.R propTyped.R basicMethods.R old2new.R makefreq.R chooseCN.R dist.genpop.R export.R setAs.R gstat.randtest.R HWE.R import.R monmonier.R coords.monmonier.R spca.R spca.rtests.R zzz.R hybridize.R fstat.R propShared.R scale.R
\ No newline at end of file
+Collate: classes.R auxil.R genind2genpop.R propTyped.R basicMethods.R old2new.R makefreq.R chooseCN.R dist.genpop.R export.R setAs.R gstat.randtest.R HWE.R import.R monmonier.R coords.monmonier.R spca.R spca.rtests.R zzz.R hybridize.R fstat.R propShared.R scale.R REMOVE-ME-TO-PASS-THE-CHECK.R
\ No newline at end of file
Modified: pkg/R/import.R
===================================================================
--- pkg/R/import.R 2008-06-26 12:17:15 UTC (rev 134)
+++ pkg/R/import.R 2008-06-26 16:02:13 UTC (rev 135)
@@ -39,7 +39,7 @@
#####################
# Function df2genind
#####################
-df2genind <- function(X, ncode=NULL, ind.names=NULL, loc.names=NULL, pop=NULL, missing=NA){
+df2genind <- function(X, sep=NULL, ncode=NULL, ind.names=NULL, loc.names=NULL, pop=NULL, missing=NA, ploidy=2){
if(is.data.frame(X)) X <- as.matrix(X)
if (!inherits(X, "matrix")) stop ("X is not a matrix")
@@ -48,10 +48,12 @@
n <- nrow(X)
nloc <- ncol(X)
-
+ ploidy <- as.integer(ploidy)
+ if(ploidy < as.integer(1)) stop("ploidy cannot be less than 1")
+
if(is.null(ind.names)) {ind.names <- rownames(X)}
if(is.null(loc.names)) {loc.names <- colnames(X)}
-
+
## pop optionnelle
if(!is.null(pop)){
if(length(pop)!= n) stop("length of factor pop differs from nrow(X)")
@@ -59,91 +61,151 @@
}
## find or check the number of coding characters, 'ncode'
- if(!is.null(ncode)) {if(ncode < max(nchar(X)) ) stop("some character strings exceed the provided ncode.")}
- if(is.null(ncode)) { ncode <- max(nchar(X)) }
- if((ncode %% 2)>0) stop("Invalid number of coding characters (should be 2, 4, or 6)")
-
-
- ## now check all strings and make sure they all have 'ncode' characters
- ## NA are temporarily coded as "00", "000" or "000000" to fit the check
- keepCheck <- any(nchar(X) < ncode)
- missAll <- paste(rep("0",ncode/2),collapse="")
- missTyp <- paste(rep("0",ncode),collapse="")
- X[is.na(X)] <- missTyp
-
- while(keepCheck){
- mat0 <- matrix("", ncol=ncol(X), nrow=nrow(X))
- mat0[nchar(X) < ncode] <- "0"
- X <- matrix(paste(mat0, X, sep=""), nrow=nrow(mat0))
- keepCheck <- any(nchar(X) < ncode)
+ if(is.null(sep)){
+ if(!is.null(ncode)) {if(ncode < max(nchar(X)) ) stop("some character strings exceed the provided ncode.")}
+ if(is.null(ncode)) { ncode <- max(nchar(X)) }
+ if((ncode %% ploidy)>0) stop(paste(ploidy,"alleles cannot be coded by a total of",
+ ncode,"characters", sep=" "))
}
-
- ## X now only contains valid strings with ncode characters
+ ## ERASE ENTIRELY NON-TYPE LOCI AND INDIVIDUALS
+ tempX <- X
+ if(!is.null(sep)) tempX <- gsub(sep,"",X)
+ tempX <- gsub("^0*$",NA,X)
+
## Erase entierely non-typed loci
- temp <- apply(X,2,function(c) all(c==missTyp))
+ temp <- apply(tempX,2,function(c) all(is.na(c)))
if(any(temp)){
X <- X[,!temp]
loc.names <- loc.names[!temp]
nloc <- ncol(X)
warning("entirely non-type marker(s) deleted")
}
-
+
## Erase entierely non-type individuals
- temp <- apply(X,1,function(c) all(c==missTyp))
+ temp <- apply(tempX,1,function(r) all(is.na(r)))
if(any(temp)){
X <- X[!temp,]
pop <- pop[!temp]
ind.names <- ind.names[!temp]
n <- nrow(X)
- warning("entirely non-type individual(s) deleted")
+ warning("entirely non-type individual(s) deleted")
}
-
+
n <- nrow(X)
# ind.names <- rownames(X) this erases the real labels
# note: if X is kept as a matrix, duplicate row names are no problem
-
- enumallel <- function (x) {
- w <- as.character(x)
- w1 <- substr(w,1,ncode/2)
- w2 <- substr(w,(ncode/2)+1,ncode)
- w3 <- sort(unique (c(w1,w2)))
- return(w3[w3!=missAll])
+
+
+ ## function to fill a matrix of char 'M' with the required
+ ## number of zero, targetN being the total number of char required
+ fillWithZero <- function(M, targetN){
+ naIdx <- is.na(M)
+ keepCheck <- any(nchar(M) < targetN)
+ while(keepCheck){
+ mat0 <- matrix("", ncol=ncol(M), nrow=nrow(M))
+ mat0[nchar(M) < targetN] <- "0"
+ M <- matrix(paste(mat0, M, sep=""), nrow=nrow(mat0))
+ keepCheck <- any(nchar(M) < targetN)
+ }
+
+ ## restore NA (otherwise we're left with "NA")
+ M[naIdx] <- NA
+ return(M)
}
- loc.all <- lapply(1:ncol(X),function(i) enumallel(X[,i]))
+ ## CHECK STRING LENGTH IF NO SEPARATOR PROVIDED
+ if(is.null(sep) | ploidy==as.integer(1)){
+ ## ## now check all strings and make sure they all have 'ncode' characters
+ ## ## NA are temporarily coded as "00", "000" or "000000" to fit the check
+ ## keepCheck <- any(nchar(X) < ncode)
+ ## missAll <- paste(rep("0",ncode/ploidy),collapse="")
+ ## missTyp <- paste(rep("0",ncode),collapse="")
+ ## X[is.na(X)] <- missTyp
+
+ ## while(keepCheck){
+ ## mat0 <- matrix("", ncol=ncol(X), nrow=nrow(X))
+ ## mat0[nchar(X) < ncode] <- "0"
+ ## X <- matrix(paste(mat0, X, sep=""), nrow=nrow(mat0))
+ ## keepCheck <- any(nchar(X) < ncode)
+ ## }
+
+ X <- fillWithZero(X,targetN=ncode)
+
+ ## now split X by allele
+ splitX <- list()
+ for(i in 1:ploidy){
+ splitX[[i]] <- substr(X,1,ncode/ploidy)
+ X <- sub(paste("^.{",ncode/ploidy,"}",sep=""),"",X)
+ }
+
+ } # END CHECK STRING LENGTH WITHOUT SEP
+
+
+ ## CHECK STRING LENGTH WITH SEPARATOR PROVIDED
+ if(!is.null(sep)){
+ if(ploidy > 1){
+ temp <- t(as.matrix(as.data.frame(strsplit(X,sep))))
+ splitX <- list()
+ for(i in 1:ncol(temp)){
+ splitX[[i]] <- matrix(temp[,i], nrow=n)
+ } # each matrix of splitX contains typing for 1 allele
+ } else {
+ splitX <- list()
+ splitX[[1]] <- X
+ }
+
+ ## get the right ncode
+ temp <- unlist(splitX)
+ temp <- temp[!is.na(temp)]
+ ncode <- max(nchar(temp))*ploidy
+ splitX <- lapply(splitX, function(Y) fillWithZero(Y,targetN=ncode/ploidy))
+ } # END CHECK STRING LENGTH WITH SEP
+
+
+ ## AT THIS STAGE, splitX IS A LIST OF MATRICES,
+ ## EACH GIVING TYPING FOR AN ALLELE
+
+ ## fetch all possible alleles per locus
+ loc.all <- list()
+ for(i in 1:nloc){
+ temp <- unlist(lapply(splitX,function(e) e[,i]))
+ loc.all[[i]] <- sort(unique(temp[!is.na(temp)]))
+ }
+
names(loc.all) <- loc.names
- # loc.all est une liste dont chaque element est un vecteur des alleles (ordonnes) d'un marqueur
+ ## loc.all is a list whose element are vectors of sorted possible alleles at a locus
temp <- lapply(1:nloc, function(i) matrix(0,nrow=n,ncol=length(loc.all[[i]]),
dimnames=list(NULL,loc.all[[i]])) )
+
+ names(temp) <- loc.names
# note: keep rownames as NULL in case of duplicates
-
- names(temp) <- loc.names
- # temp est une liste dont chaque element est un tableau-marqueur (indiv x alleles)
+ ## temp is a list whose elements are one matrix (indiv x alleles) for each marker
- # remplissage des tableaux
+ ## now tables in 'temp' are filled up
findall <- function(cha,loc.all){
- if(cha==missAll) return(NULL)
+ if(is.na(cha)) return(NULL)
return(which(cha==loc.all))
}
- for(i in 1:n){
- for(j in 1:nloc){
- all1pos <- findall(substr(X[i,j],1,ncode/2),loc.all[[j]])
- temp[[j]][i,all1pos] <- temp[[j]][i,all1pos] + 0.5
- all2pos <- findall(substr(X[i,j],(ncode/2)+1,ncode),loc.all[[j]])
- temp[[j]][i,all2pos] <- temp[[j]][i,all2pos] + 0.5
- if(is.null(c(all1pos,all2pos))) {temp[[j]][i,] <- NA}
- }
+ for(k in 1:ploidy){
+ for(i in 1:n){
+ for(j in 1:nloc){
+ allIdx <- findall(splitX[[k]][i,j],loc.all[[j]])
+ temp[[j]][i,allIdx] <- temp[[j]][i,allIdx] + 1
+ if(is.null(allIdx)) {temp[[j]][i,] <- NA}
+ }
+ }
}
- # beware: colnames are wrong when there is only one allele in a locus
- # right colnames are first generated
+ ## beware: colnames are wrong when there is only one allele in a locus
+ ## right colnames are first generated
nall <- unlist(lapply(temp,ncol))
loc.rep <- rep(names(nall),nall)
col.lab <- paste(loc.rep,unlist(loc.all,use.names=FALSE),sep=".")
mat <- as.matrix(cbind.data.frame(temp))
+ mat <- mat/ploidy
colnames(mat) <- col.lab
rownames(mat) <- ind.names
@@ -157,7 +219,7 @@
prevcall <- match.call()
- res <- genind( tab=mat, pop=pop, prevcall=prevcall )
+ res <- genind( tab=mat, pop=pop, prevcall=prevcall, ploidy=ploidy )
return(res)
} # end df2genind
Modified: pkg/man/propTyped.Rd
===================================================================
--- pkg/man/propTyped.Rd 2008-06-26 12:17:15 UTC (rev 134)
+++ pkg/man/propTyped.Rd 2008-06-26 16:02:13 UTC (rev 135)
@@ -32,17 +32,15 @@
"loc"), or a matrix of binary data (when \code{by} equals "both")
}
\details{
- The argument \code{method} is used as follows:\cr
-
- - \code{sigma}: scaling is made using the usual standard deviation\cr
-
- - \code{binom}: scaling is made using the theoretical variance of the
- allele frequency. This can be used to avoid that frequencies close to
- 0.5 have a stronger variance that those close to 0 or 1.
+ When \code{by} is set to "both", the result is a matrix of binary data
+ with entities in rows (individuals or populations) and markers in
+ columns. The values of the matrix are 1 for typed data, and 0 for NA.
}
\author{Thibaut Jombart \email{jombart at biomserv.univ-lyon1.fr} }
\examples{
-
+data(nancycats)
+propTyped(nancycats,by="loc")
+propTyped(genind2genpop(nancycats),by="both")
}
\keyword{methods}
\keyword{manip}
More information about the adegenet-commits
mailing list