[Rflptools-commits] r2 - in pkg: . RFLPtools RFLPtools/R RFLPtools/data RFLPtools/inst RFLPtools/inst/doc RFLPtools/inst/extdata RFLPtools/man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Wed Feb 24 19:50:56 CET 2010


Author: stamats
Date: 2010-02-24 19:50:56 +0100 (Wed, 24 Feb 2010)
New Revision: 2

Added:
   pkg/RFLPtools/
   pkg/RFLPtools/DESCRIPTION
   pkg/RFLPtools/NAMESPACE
   pkg/RFLPtools/NEWS
   pkg/RFLPtools/R/
   pkg/RFLPtools/R/RFLPdist.R
   pkg/RFLPtools/R/RFLPdist2.R
   pkg/RFLPtools/R/RFLPdist2ref.R
   pkg/RFLPtools/R/RFLPplot.R
   pkg/RFLPtools/R/RFLPqc.R
   pkg/RFLPtools/R/RFLPrefplot.R
   pkg/RFLPtools/R/read.blast.R
   pkg/RFLPtools/R/read.rflp.R
   pkg/RFLPtools/R/sim2dist.R
   pkg/RFLPtools/R/simMatrix.R
   pkg/RFLPtools/R/write.hclust.R
   pkg/RFLPtools/data/
   pkg/RFLPtools/data/BLASTdata.RData
   pkg/RFLPtools/data/RFLPdata.RData
   pkg/RFLPtools/data/RFLPref.RData
   pkg/RFLPtools/inst/
   pkg/RFLPtools/inst/CITATION
   pkg/RFLPtools/inst/doc/
   pkg/RFLPtools/inst/doc/RFLPtools.Rnw
   pkg/RFLPtools/inst/doc/RFLPtools.pdf
   pkg/RFLPtools/inst/doc/logoUBT.png
   pkg/RFLPtools/inst/extdata/
   pkg/RFLPtools/inst/extdata/AZ091016_report.txt
   pkg/RFLPtools/inst/extdata/BLASTexample.txt
   pkg/RFLPtools/inst/extdata/RFLPexample.txt
   pkg/RFLPtools/man/
   pkg/RFLPtools/man/0RFLP-package.Rd
   pkg/RFLPtools/man/BLASTdata.Rd
   pkg/RFLPtools/man/RFLPdata.Rd
   pkg/RFLPtools/man/RFLPdist.Rd
   pkg/RFLPtools/man/RFLPdist2.Rd
   pkg/RFLPtools/man/RFLPdist2ref.Rd
   pkg/RFLPtools/man/RFLPplot.Rd
   pkg/RFLPtools/man/RFLPqc.Rd
   pkg/RFLPtools/man/RFLPref.Rd
   pkg/RFLPtools/man/RFLPrefplot.Rd
   pkg/RFLPtools/man/nrBands.Rd
   pkg/RFLPtools/man/read.blast.Rd
   pkg/RFLPtools/man/read.rflp.Rd
   pkg/RFLPtools/man/sim2dist.Rd
   pkg/RFLPtools/man/simMatrix.Rd
   pkg/RFLPtools/man/write.hclust.Rd
Log:
uploaded version 1.1 of RFLPtools

Added: pkg/RFLPtools/DESCRIPTION
===================================================================
--- pkg/RFLPtools/DESCRIPTION	                        (rev 0)
+++ pkg/RFLPtools/DESCRIPTION	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,13 @@
+Package: RFLPtools
+Type: Package
+Title: Tools to analyse RFLP data
+Version: 1.1
+Date: 2010-02-21
+Author: Fabienne Flessa, Alexandra Kehl, Matthias Kohl
+Maintainer: Matthias Kohl <Matthias.Kohl at stamats.de>
+Description: RFLPtools provides functions to analyse DNA fragment samples 
+  (i.e. derived from RFLP-analysis) and standalone BLAST report files 
+  (i.e. DNA sequence analysis).
+Depends: R(>= 2.9.0), stats, utils, graphics, grDevices, RColorBrewer
+Suggests: lattice, MKmisc
+License: LGPL-3


Property changes on: pkg/RFLPtools/DESCRIPTION
___________________________________________________________________
Name: svn:executable
   + *

Added: pkg/RFLPtools/NAMESPACE
===================================================================
--- pkg/RFLPtools/NAMESPACE	                        (rev 0)
+++ pkg/RFLPtools/NAMESPACE	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,12 @@
+export(read.rflp,
+       RFLPqc,
+       RFLPdist,
+       RFLPdist2,
+       RFLPplot,
+       RFLPdist2ref,
+       RFLPrefplot,
+       nrBands,
+       read.blast,
+       simMatrix,
+       sim2dist,
+       write.hclust)


Property changes on: pkg/RFLPtools/NAMESPACE
___________________________________________________________________
Name: svn:executable
   + *

Added: pkg/RFLPtools/NEWS
===================================================================
--- pkg/RFLPtools/NEWS	                        (rev 0)
+++ pkg/RFLPtools/NEWS	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,18 @@
+###############################################################################
+## NEWS to package RFLPtools
+###############################################################################
+
+###########################################################
+## Version 1.0
+###########################################################
+- start of implementation, first version on CRAN
+
+###########################################################
+## Version 1.1
+###########################################################
+- added NEWS file
+- some examples simplyfied to reduce check-time
+- added warnings for BLAST datasets for the case that there are subject.ids 
+  which do not occur as query.ids
+- extended vignette and package Rd-file by RFLPplot and RFLPrefplot examples
+

Added: pkg/RFLPtools/R/RFLPdist.R
===================================================================
--- pkg/RFLPtools/R/RFLPdist.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPdist.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,32 @@
+###############################################################################
+## Computation of distances for RFLP data
+###############################################################################
+
+## x: data.frame with RFLP data
+## distfun: function to compute distance (cf. ?dist)
+RFLPdist <- function(x, distfun = dist, nrBands){
+    stopifnot(is.data.frame(x))
+    stopifnot(is.function(dist))
+    x1 <- split(x, x$Sample)
+    nrbands <- sort(unique(sapply(x1, nrow)))
+    x1.bands <- sapply(x1, nrow)
+
+    if(missing(nrBands)){
+        res <- vector("list", length(nrbands))
+        index <- 0
+        for(i in nrbands){
+            index <- index + 1
+            temp <- do.call("rbind", x1[x1.bands == i])
+            temp1 <- split(temp[,"MW"], factor(temp[,"Sample"]))
+            res[[index]] <-  distfun(do.call("rbind", temp1))
+        }
+        names(res) <- nrbands
+    }else{
+        if(!(nrBands %in% nrbands))
+            stop("No samples with given number of bands!")
+        temp <- do.call("rbind", x1[x1.bands == nrBands])
+        temp1 <- split(temp[,"MW"], factor(temp[,"Sample"]))
+        res <-  distfun(do.call("rbind", temp1))
+    }
+    res
+}

Added: pkg/RFLPtools/R/RFLPdist2.R
===================================================================
--- pkg/RFLPtools/R/RFLPdist2.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPdist2.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,57 @@
+###############################################################################
+## Computation of distances for RFLP data - alternative approach
+###############################################################################
+
+## compute number of bands
+nrBands <- function(x){
+    sort(unique(sapply(split(x, x$Sample), nrow)))
+}
+
+## x: data.frame with RFLP data
+## distfun: function to compute distance (e.g. ?dist)
+## nrBands: number of bands 
+## nrMissing: number of bands which may be nrMissing
+## diag: see ?dist
+## upper: see ?dist
+## compares samples with number of bands in: nrBands, nrBands + 1, ..., nrBands + nrMissing
+RFLPdist2 <- function(x, distfun = dist, nrBands, nrMissing, diag = FALSE, upper = FALSE){
+    stopifnot(is.data.frame(x))
+    x1 <- split(x, x$Sample)
+    nrbands <- sort(unique(sapply(x1, nrow)))
+    x1.bands <- sapply(x1, nrow)
+
+    temp <- do.call("rbind", x1[x1.bands %in% c(nrBands:(nrBands+nrMissing))])
+    temp1 <- split(temp[,"MW"], factor(temp[,"Sample"]))
+    N <- length(temp1)
+    temp2 <- matrix(NA, ncol = nrBands + nrMissing, nrow = N)
+    rownames(temp2) <- names(temp1)
+    for(i in 1:N){
+        temp2[i,1:length(temp1[[i]])] <- temp1[[i]]
+    }
+    d <- matrix(NA, nrow = N, ncol = N)
+    dfun <- function(x, y){
+        m <- sum(!is.na(x))
+        min(as.matrix(dist(rbind(x[1:m], t(combn(y, m)))))[-1,1])
+    }
+    for(i in 1:N){
+        for(j in 1:i){
+            if(sum(!is.na(temp2[i,])) == sum(!is.na(temp2[j,]))){
+                m <- sum(!is.na(temp2[i,]))
+                d[i,j] <- as.vector(distfun(rbind(temp2[i,1:m], temp2[j,1:m])))
+            }
+            if(sum(!is.na(temp2[i,])) > sum(!is.na(temp2[j,])))
+                d[i,j] <- dfun(temp2[j,], temp2[i,])
+            if(sum(!is.na(temp2[i,])) < sum(!is.na(temp2[j,])))
+                d[i,j] <- dfun(temp2[i,], temp2[j,])
+        }
+    }
+    d <- d[lower.tri(d)]
+    attr(d, "Size") <- N
+    attr(d, "Labels") <- dimnames(temp2)[[1]]
+    attr(d, "Diag") <- diag
+    attr(d, "Upper") <- upper
+    attr(d, "method") <- distfun
+    attr(d, "call") <- match.call()
+    class(d) <- "dist"
+    return(d)
+}

Added: pkg/RFLPtools/R/RFLPdist2ref.R
===================================================================
--- pkg/RFLPtools/R/RFLPdist2ref.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPdist2ref.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,39 @@
+###############################################################################
+## Computation of distances for RFLP data
+###############################################################################
+
+## x: data.frame with RFLP data
+## ref: data.frame with RFLP reference data
+## distfun: function to compute distance (cf. ?dist)
+RFLPdist2ref <- function(x, ref, distfun = dist, nrBands){
+    stopifnot(is.data.frame(x))
+    stopifnot(is.data.frame(ref))
+    stopifnot(is.function(dist))
+    
+    if(missing(nrBands))
+        stop("Number of Bands 'nrBands' is missing.")
+        
+    x1 <- split(x, x$Sample)
+    ref1 <- split(ref, ref$Sample)
+    nrbands <- sort(unique(sapply(x1, nrow)))
+    ref.nrbands <- sort(unique(sapply(ref1, nrow)))
+    
+    if(!(nrBands %in% nrbands))
+        stop("There is no sample with specified number of bands 'nrBands'.")
+        
+    if(!(nrBands %in% ref.nrbands))
+        stop("There is no reference sample with specified number of bands 'nrBands'.")
+     
+    x1.bands <- sapply(x1, nrow)
+    ref1.bands <- sapply(ref1, nrow)
+
+    temp <- do.call("rbind", x1[x1.bands == nrBands])
+    ref.temp <- do.call("rbind", ref1[ref1.bands == nrBands])
+
+    temp1 <- split(temp[,"MW"], factor(temp[,"Sample"]))
+    grp <- factor(paste(ref.temp[,"Taxonname"], " (", ref.temp[,"Accession"], ")", sep = ""))
+    ref.temp1 <- split(ref.temp[,"MW"], grp)
+
+    res <- as.matrix(distfun(do.call("rbind", c(ref.temp1, temp1))))
+    res[(length(ref.temp1)+1):nrow(res), 1:length(ref.temp1), drop = FALSE]
+}

Added: pkg/RFLPtools/R/RFLPplot.R
===================================================================
--- pkg/RFLPtools/R/RFLPplot.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPplot.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,73 @@
+###############################################################################
+## Computation of distances for RFLP data
+###############################################################################
+
+## x: data.frame with RFLP data
+## distfun: function to compute distance (cf. ?dist)
+RFLPplot <- function(x, nrBands, nrMissing, distfun = dist, hclust.method = "complete", 
+                     mar.bottom = 5, cex.axis = 0.5){
+    stopifnot(is.data.frame(x))
+    stopifnot(is.function(dist))
+    if(missing(nrBands))
+        stop("Number of Bands 'nrBands' is missing.")
+        
+    x1 <- split(x, x$Sample)
+    nrbands <- sort(unique(sapply(x1, nrow)))
+    x1.bands <- sapply(x1, nrow)
+
+    if(missing(nrMissing)){
+        ind <- x1.bands == nrBands
+        dend.ord <- order.dendrogram(as.dendrogram(hclust(RFLPdist(x, distfun = distfun, 
+                                                                   nrBands = nrBands), 
+                                                          method = hclust.method)))
+    }else{
+        ind <- x1.bands %in% c(nrBands:(nrBands+nrMissing))
+        dend.ord <- order.dendrogram(as.dendrogram(hclust(RFLPdist2(x, distfun = distfun, 
+                                                                   nrBands = nrBands,
+                                                                   nrMissing = nrMissing), 
+                                                          method = hclust.method)))
+    }
+    temp <- x1[ind]
+    temp1 <- do.call("rbind", temp)
+    
+    par(mar = c(mar.bottom, 4, 4, 2) + 0.1)
+    lo <- 10*trunc(min(temp1$MW)/10)
+    up <- 10*ceiling(max(temp1$MW)/10)
+    if(up-lo < 1) up <- up + 1
+    
+
+    plot(NA, NA, xlim = c(0.25,length(temp)+0.75), ylim = c(lo, up),
+         xlab = "", ylab = "molecular weight", axes = FALSE)
+    axis(1, at = 1:length(temp), labels = names(temp)[dend.ord], las = 2, cex.axis = cex.axis)
+        
+    At <- round(seq(lo, up, length = 11), 0)
+    axis(2, at = At, labels = as.character(At), las = 2)
+    abline(h = At, col = "grey", lty = 2)
+    box()
+    if(missing(nrMissing)){
+        title(paste("Samples with", nrBands, "bands"))
+    }else{
+        if(nrMissing == 1){
+            title(paste("Samples with", nrBands, "or", nrBands+1, "bands"))
+        }else{
+            title(paste("Samples with", nrBands, ", ..., ", nrBands+nrMissing, "bands"))
+        }
+    }
+    rm(temp1)
+    if(length(temp) <= 9){
+        mycol <- brewer.pal(max(3, length(temp)), "Set1")
+    }else{
+        mycol1 <- brewer.pal(9, "Set1")
+        mycol <- colorRampPalette(mycol1)(length(temp))
+    }
+
+    temp.o <- temp[dend.ord]
+    for(i in seq_along(temp.o)){
+        temp.mw <- temp.o[[i]]$MW
+        reps <- length(temp.mw)
+        matlines(rbind(rep(i-0.25, reps), rep(i+0.25, reps)), 
+                 rbind(temp.mw, temp.mw), 
+                 lwd = 3, lty = 1, col = mycol[i])
+    }
+    invisible()
+}

Added: pkg/RFLPtools/R/RFLPqc.R
===================================================================
--- pkg/RFLPtools/R/RFLPqc.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPqc.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,41 @@
+###############################################################################
+## Quality control of RFLP data
+###############################################################################
+
+RFLPqc <- function(x, rm.band1 = TRUE, QC.lo = 0.8, QC.up = 1.07, QC.rm = FALSE){
+    if(length(QC.lo) > 1){
+        warning("Only first element of 'QC.lo' is used.")
+        QC.lo <- QC.lo[1]
+    }
+    if(length(QC.up) > 1){
+        warning("Only first element of 'QC.up' is used.")
+        QC.up <- QC.up[1]
+    }
+    if(QC.lo <= 0 || QC.lo >= 1)
+        stop("'QC.lo' has to be in (0,1).")
+    if(QC.up <= 1)
+        stop("'QC.up' has to be larger than 1.")        
+    temp <- split(x$MW, x$Sample)
+    if(QC.rm) rm.samp <- NULL
+    for(i in seq_along(temp)){
+        tempi <- temp[[i]]
+        if((sum(tempi[-1]) < QC.lo*tempi[1]) || (sum(tempi[-1]) > QC.up*tempi[1])){
+            cat("Sum of bands of sample ", names(temp)[i], " out of range (", 
+                round(sum(tempi[-1])/tempi[1]*100, 2), "%)!", sep = "", fill = TRUE)
+            if(QC.rm){
+                rm.samp <- c(rm.samp, names(temp)[i])
+            }
+        }
+    }
+    
+    if(rm.band1){ 
+        x <- x[x$Band != 1, ]
+        x$Band <- x$Band - 1
+    }
+    
+    if(QC.rm) x <- x[!(x$Sample %in% rm.samp), ]
+    
+    if(rm.band1 || QC.rm) rownames(x) <- 1:nrow(x)
+    
+    return(x)
+}

Added: pkg/RFLPtools/R/RFLPrefplot.R
===================================================================
--- pkg/RFLPtools/R/RFLPrefplot.R	                        (rev 0)
+++ pkg/RFLPtools/R/RFLPrefplot.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,71 @@
+###############################################################################
+## Computation of distances for RFLP data
+###############################################################################
+
+## x: data.frame with RFLP data
+## distfun: function to compute distance (cf. ?dist)
+RFLPrefplot <- function(x, ref, distfun = dist, nrBands, mar.bottom = 5, 
+                        cex.main = 1.2, cex.axis = 0.5, devNew = FALSE){
+    res <- RFLPdist2ref(x, ref, distfun, nrBands)
+        
+    x1 <- split(x, x$Sample)
+    temp <- x1[names(x1) %in% rownames(res)]
+
+    ref1 <- split(ref, ref$Sample)
+    grp <- sapply(ref1, function(x) paste(x[1,"Taxonname"], " (", x[1,"Accession"], ")", sep = ""))
+    ref.temp <- ref1[grp %in% colnames(res)]
+    
+    temp1 <- do.call("rbind", temp)
+    ref.temp1 <- do.call("rbind", ref.temp)
+    
+    par(mar = c(mar.bottom, 4, 4, 2) + 0.1)
+    lo <- 10*trunc(min(temp1$MW, ref.temp1$MW)/10)
+    up <- 10*ceiling(max(temp1$MW, ref.temp1$MW)/10)
+    if(up-lo < 1) up <- up + 1
+    At <- round(seq(lo, up, length = 11), 0)
+    
+    grp1 <- paste(ref.temp1[, "Taxonname"], " (", ref.temp1[, "Accession"], ")", sep = "")
+    if(devNew){
+        par(mar = c(mar.bottom, 4, 4, 2) + 0.1)
+    }else{
+        par(mfrow = c(1,ncol(res)), mar = c(mar.bottom, 4, 4, 2) + 0.1)
+    }
+    for(i in 1:ncol(res)){
+        if(devNew && i > 1){ 
+            dev.new()
+            par(mar = c(mar.bottom, 4, 4, 2) + 0.1)
+        }
+        temp.mw <- ref.temp1$MW[grp1 == colnames(res)[i]]
+        reps <- length(temp.mw)
+        samp.o <- order(res[,i])
+        plot(NA, NA, xlim = c(0.25,length(temp)+2.75), ylim = c(lo, up),
+            xlab = "", ylab = "molecular weight", axes = FALSE)
+        axis(1, at = c(1,3:(length(temp)+2)), labels = c("Reference", rownames(res)[samp.o]), 
+             las = 2, cex.axis = cex.axis)
+        axis(2, at = At, labels = as.character(At), las = 2)
+        abline(h = At, col = "grey", lty = 2)
+        abline(h = temp.mw, col = "black", lty = 1)
+        box()
+        title(paste("Reference sample:", colnames(res)[i]), cex.main = cex.main)
+        if(length(temp) <= 9){
+            mycol <- brewer.pal(max(3, length(temp)), "Set1")
+        }else{
+            mycol1 <- brewer.pal(9, "Set1")
+            mycol <- colorRampPalette(mycol1)(length(temp))
+        }
+
+        matlines(rbind(rep(0.75, reps), rep(1.25, reps)), 
+                 rbind(temp.mw, temp.mw), 
+                 lwd = 3, lty = 1, col = "black")
+                    
+        temp.o <- temp[samp.o]
+        for(i in seq_along(temp.o)){
+            temp.mw <- temp.o[[i]]$MW
+            reps <- length(temp.mw)
+            matlines(rbind(rep(i+2-0.25, reps), rep(i+2+0.25, reps)), 
+                    rbind(temp.mw, temp.mw), 
+                    lwd = 3, lty = 1, col = mycol[i])
+        }
+    }
+    invisible()
+}

Added: pkg/RFLPtools/R/read.blast.R
===================================================================
--- pkg/RFLPtools/R/read.blast.R	                        (rev 0)
+++ pkg/RFLPtools/R/read.blast.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,18 @@
+###############################################################################
+## Read BLAST data
+###############################################################################
+
+read.blast <- function(file){
+    x <- read.delim(file = file,
+                        header = FALSE, stringsAsFactors = FALSE)
+    if(ncol(x) != 12)
+        stop("Data in given file", basename(file), "has wrong dimension!")
+    names(x) <- c("query.id", "subject.id", "identity", "alignment.length",
+                  "missmatches", "gap.opens", "q.start", "q.end", "s.start",
+                  "s.end", "evalue", "bit.score")
+    if(!all(sid <- x[,"subject.id"] %in% x[,"query.id"]))
+        warning("The following 'subject.id's do not occur as 'query.id's: ",
+                paste(x[!sid,"subject.id"], collapse = ", "),
+                "\nThis may lead to problems in subsequent analyses!")
+    x
+}

Added: pkg/RFLPtools/R/read.rflp.R
===================================================================
--- pkg/RFLPtools/R/read.rflp.R	                        (rev 0)
+++ pkg/RFLPtools/R/read.rflp.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,21 @@
+###############################################################################
+## Read RFLP data
+###############################################################################
+
+read.rflp <- function(file){
+    x <- read.delim(file = file, stringsAsFactors = FALSE)
+    if(!ncol(x) %in% c(3, 4))
+        stop("Data in given file", basename(file), "has wrong dimension!")
+    
+    if(ncol(x) == 3){
+        Gel <- sapply(x$Sample, function(x) tail(strsplit(x, "\\_")[[1]], n = 1))
+        fun <- function(x){ 
+            temp <- strsplit(x, "\\_")[[1]]
+            paste(temp[-length(temp)], collapse = "_")
+        }
+        Sample <- sapply(x$Sample, fun)
+        x$Sample <- Sample
+        x <- data.frame(x, Gel, stringsAsFactors = FALSE)
+    }
+    x
+}

Added: pkg/RFLPtools/R/sim2dist.R
===================================================================
--- pkg/RFLPtools/R/sim2dist.R	                        (rev 0)
+++ pkg/RFLPtools/R/sim2dist.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,10 @@
+###############################################################################
+## convert similarity matrix to dist object
+###############################################################################
+sim2dist <- function(x, maxSim = 1){
+    stopifnot(is.matrix(x))
+    stopifnot(isSymmetric(x))
+    stopifnot(maxSim >= max(x))
+    d <- maxSim - x # from similarity to distance
+    return(as.dist(d))
+}

Added: pkg/RFLPtools/R/simMatrix.R
===================================================================
--- pkg/RFLPtools/R/simMatrix.R	                        (rev 0)
+++ pkg/RFLPtools/R/simMatrix.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,122 @@
+###############################################################################
+## Compute similarity matrix for BLAST data
+###############################################################################
+
+## x: local blast output
+## sequence.range: logical: w/o sequence range
+## Min: minimum used in case of sequence.range
+## Max: maximum used in case of sequence.range
+simMatrix <- function(x, sequence.range = FALSE, Min, Max){
+    stopifnot(is.data.frame(x))
+    vars <- c("query.id", "subject.id", "alignment.length", "identity")
+    if(!all(vars %in% names(x))){
+        stop("Variables", vars[which(!(vars %in% names(x)))], "are not present in 'x'!")
+    }
+    if(!all(sid <- x[,"subject.id"] %in% x[,"query.id"])){
+        warning("The following 'subject.id's do not occur as 'query.id's: ",
+                paste(x[!sid,"subject.id"], collapse = ", "),
+                "\nThese IDs have been removed before computing the similarity matrix!")
+        x <- x[sid,]
+    }
+    ind <- x[,"query.id"] == x[,"subject.id"]
+
+    x1 <- x[ind,]
+    x2 <- split(x1, factor(x1[,"query.id"]))
+    rm(x1)
+    gc(verbose = FALSE)
+    
+    fun1 <- function(x){
+        ind <- which.max(x[,"alignment.length"])
+        x[ind,]
+    }
+    x3 <- do.call(rbind, lapply(x2, fun1))
+    rm(x2)
+    gc(verbose = FALSE)
+    
+    if(sequence.range){
+        if(missing(Min) & missing(Max))
+            stop("'Min' and 'Max' are missing. At least one has to be specified!")
+        len <- x3[,"alignment.length"]
+        if(!missing(Min)){
+            if(Min > max(len)) 
+                stop("'Min' is larger than maximum sequence length!")
+        }else{
+            Min <- min(len)
+        }
+        if(!missing(Max)){
+            if(Max < min(len)) 
+                stop("'Max' is smaller than maximum sequence length!")
+        }else{
+            Max <- max(len)
+        }
+        if(Min >= Max)
+            stop("'Min' >= 'Max'!")
+        ind.range <- (len >= Min) & (len <= Max)
+        x31 <- x3[ind.range,]
+        sequenzen <- x31[,"query.id"]
+        x32 <- x[x[,"query.id"] %in% sequenzen,]
+        x33 <- x32[x32[,"subject.id"] %in% sequenzen,]
+        x4 <- rbind(x33, x31)
+        rm(x31, x32)
+        gc(verbose = FALSE)
+    }else{
+        x4 <- rbind(x[!ind,], x3)
+    }
+
+    mB <- round(x4[,"identity"]*x4[,"alignment.length"]/100, 0)
+    x5 <- data.frame(x4, mB)
+    rm(x4)
+    gc(verbose = FALSE)    
+
+    x6 <- split(x5, factor(x5[,"query.id"]))
+    rm(x5)
+    gc(verbose = FALSE)
+    fun2 <- function(x, y){
+        ind <- y[,"query.id"] == x[1,"query.id"]
+        S <- x$mB/y[ind,"alignment.length"]
+        cbind(x, S)
+    }
+    x7 <- do.call(rbind, lapply(x6, fun2, y = x3))
+    rm(x3, x6)
+    gc(verbose = FALSE)
+
+    x8 <- split(x7, factor(x7[,"query.id"]))
+    fun3 <- function(x, ids){
+        y <- split(x, factor(x[,"subject.id"]))
+        f <- function(x){
+            ind <- which.max(x[,"S"])
+            x[ind,]
+        }
+        res <- do.call(rbind, lapply(y, f))
+        mis <- ids[!(ids %in% res[,"subject.id"])]
+        if(length(mis) != 0){
+            M <- matrix(0, ncol = ncol(res), nrow = length(mis))
+            colnames(M) <- names(res)
+            M <- as.data.frame(M)
+            M[,"query.id"] <- res[1,"query.id"]
+            M[,"subject.id"] <- mis
+            return(rbind(res, M))
+        }else{
+            return(res)
+        }
+    }
+    if(sequence.range){
+        ids <- unique(x33[,"query.id"])
+	rm(x33)
+    }else{
+        ids <- unique(x[,"query.id"])
+    }
+    x9 <- do.call(rbind, lapply(x8, fun3, ids = ids))
+    rm(x8)
+    gc(verbose = FALSE)
+
+    x10 <- x9[order(x9[,"query.id"], x9[,"subject.id"]),]
+    rm(x9)
+    gc(verbose = FALSE)
+    
+    Sim <- matrix(x10[,"S"], ncol = length(ids), byrow = TRUE)
+    Sim <- pmax(Sim, t(Sim))
+    colnames(Sim) <- sort(ids)
+    rownames(Sim) <- sort(ids)
+    return(Sim)
+}

Added: pkg/RFLPtools/R/write.hclust.R
===================================================================
--- pkg/RFLPtools/R/write.hclust.R	                        (rev 0)
+++ pkg/RFLPtools/R/write.hclust.R	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,17 @@
+write.hclust <- function(x, file, prefix, h = NULL, k = NULL, append = FALSE,
+                         dec = ","){
+    grp <- cutree(x, h=h, k=k)
+    grp.lab <- formatC(grp, digits = 0, width = max(nchar(as.character(grp))), 
+                       format = "f", flag = "0")
+    res <- data.frame("Sample" = names(grp),
+                      "Cluster" = grp, 
+                      "Cluster ID" = paste(prefix, "_H", h, "_", grp.lab, sep = ""),
+                      check.names = FALSE)
+    if(append){
+        write.table(res, file = file, append = append, dec = dec, 
+                    sep ="\t", row.names = FALSE, col.names = FALSE)
+    }else{
+        write.table(res, file = file, append = append, dec = dec, 
+                    sep ="\t", row.names = FALSE)
+    }
+}

Added: pkg/RFLPtools/data/BLASTdata.RData
===================================================================
(Binary files differ)


Property changes on: pkg/RFLPtools/data/BLASTdata.RData
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: pkg/RFLPtools/data/RFLPdata.RData
===================================================================
(Binary files differ)


Property changes on: pkg/RFLPtools/data/RFLPdata.RData
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: pkg/RFLPtools/data/RFLPref.RData
===================================================================
(Binary files differ)


Property changes on: pkg/RFLPtools/data/RFLPref.RData
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: pkg/RFLPtools/inst/CITATION
===================================================================
--- pkg/RFLPtools/inst/CITATION	                        (rev 0)
+++ pkg/RFLPtools/inst/CITATION	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,19 @@
+if(!exists("meta") || is.null(meta)) meta <- packageDescription("RFLPtools")
+year <- sub("-.*", "", meta$Date)
+note <- sprintf("R package version %s", meta$Version)
+
+citHeader("To cite package RFLPtools in publications use:")
+
+citEntry(entry="Manual",
+         title = "RFLPtools: Tools to analyse RFLP data",
+         author = personList(as.person("F. Flessa"), 
+                             as.person("A. Kehl"), 
+                             as.person("M. Kohl")),
+         language = "English",
+         year = year,
+         note = note,
+         type = "R package",
+         textVersion = paste("Flessa, F., Kehl, A., and Kohl, M.",
+                             sprintf("(%s).", year),
+                             "RFLPtools: Tools to analyse RFLP data.",
+                             paste(note, ".", sep = "")))


Property changes on: pkg/RFLPtools/inst/CITATION
___________________________________________________________________
Name: svn:executable
   + *

Added: pkg/RFLPtools/inst/doc/RFLPtools.Rnw
===================================================================
--- pkg/RFLPtools/inst/doc/RFLPtools.Rnw	                        (rev 0)
+++ pkg/RFLPtools/inst/doc/RFLPtools.Rnw	2010-02-24 18:50:56 UTC (rev 2)
@@ -0,0 +1,246 @@
+%\VignetteIndexEntry{RFLPtools}
+%\VignetteDepends{stats, lattice, MKmisc, RColorBrewer}
+%\VignetteKeywords{RFLPtools}
+%\VignettePackage{RFLPtools}
+%
+\documentclass[11pt]{article}
+\usepackage{geometry}\usepackage{color}
+\definecolor{darkblue}{rgb}{0.0,0.0,0.75}
+\usepackage[%
+pdftitle={RFLPtools},%
+pdfauthor={F. Flessa, A. Kehl and M. Kohl},%
+pdfsubject={RFLPtools},%
+pdfkeywords={RFLPtools, RFLP, BLAST},%
+pagebackref,bookmarks,colorlinks,linkcolor=darkblue,citecolor=darkblue,%
+pagecolor=darkblue,raiselinks,plainpages,pdftex]{hyperref}
+%
+\markboth{\sl Package ``{\tt RFLPtools}''}{\sl Package ``{\tt RFLPtools}''}
+%
+% -------------------------------------------------------------------------------
+\newcommand{\code}[1]{{\tt #1}}
+\newcommand{\pkg}[1]{{\tt "#1"}}
+\newcommand{\myinfig}[2]{%
+%  \begin{figure}[htbp]
+    \begin{center}
+      \includegraphics[width = #1\textwidth]{#2}
+%      \caption{\label{#1}#3}
+    \end{center}
+%  \end{figure}
+}
+% -------------------------------------------------------------------------------
+%
+% -------------------------------------------------------------------------------
+\begin{document}
+\SweaveOpts{keep.source = TRUE, eval = TRUE, include = FALSE}
+%-------------------------------------------------------------------------------
+\title{RFLPtools: Analysis of DNA fragment samples and standalone BLAST report files}
+%-------------------------------------------------------------------------------
+\author{F. Flessa, A. Kehl and M. Kohl\\ \\
+\includegraphics[width = 3cm]{logoUBT.png}
+}
+\maketitle
+\tableofcontents
+%-------------------------------------------------------------------------------
+\section{Introduction}
+%-------------------------------------------------------------------------------
+The package \pkg{RFLPtools} aims at 
+\begin{itemize}
+\item the detection of similar band patterns based on DNA fingerprint fragment 
+sizes (i.e. derived from RFLP-analysis)
+\item the analysis of standalone BLAST report files (i.e. DNA sequence analysis)
+\end{itemize}
+In this short vignette we describe and demonstrate the available functions.
+<<load>>=
+library(RFLPtools)
+@
+%-------------------------------------------------------------------------------
+\section{RFLP data}
+%-------------------------------------------------------------------------------
+We load example data and compute the Euclidean distance ...
+<<eucl>>=
+data(RFLPdata)
+res <- RFLPdist(RFLPdata)
+names(res) ## number of bands
+str(res$"6")
+@
+Of course, we can also use other well-known distances ...
+<<other>>=
+res1 <- RFLPdist(RFLPdata, distfun = function(x) dist(x, method = "manhattan"))
+res2 <- RFLPdist(RFLPdata, distfun = function(x) dist(x, method = "maximum"))
+str(res[[1]])
+str(res1[[1]])
+str(res2[[1]])
+@
+Correlation distances
+<<cor>>=
+library(MKmisc)
+res3 <- RFLPdist(RFLPdata, distfun = corDist)
+str(res3$"9")
+@
+As we obtain a list of \code{dist} objects we can easily perform hierarchical 
+clustering ...
+<<hclust, fig = TRUE>>=
+par(mfrow = c(2,2))
+plot(hclust(res[[1]]), main = "Euclidean distance")
+plot(hclust(res1[[1]]), main = "Manhattan distance")
+plot(hclust(res2[[1]]), main = "Maximum distance")
+plot(hclust(res3[[1]]), main = "Pearson correlation distance")
+@
+\myinfig{1}{RFLPtools-hclust.pdf}
+We easily can apply other functions ...
+<<cutree>>=
+clust4bd <- hclust(res[[2]])
+cgroups50 <- cutree(clust4bd, h=50)
+cgroups50
+@
+Another possibility to display the similarity of the samples are so-called 
+(dis-)similarity matrices ...
+<<sim1, fig = TRUE>>=
+library(RColorBrewer)
+library(MKmisc)
+myCol <- colorRampPalette(brewer.pal(8, "RdYlGn"))(128)
+ord <- order.dendrogram(as.dendrogram(hclust(res[[1]])))
+temp <- as.matrix(res[[1]])
+corPlot(temp[ord,ord], col = rev(myCol), minCor = 0, 
+        labels = colnames(temp), title = "(Dis-)Similarity Plot")
+@
+\myinfig{1}{RFLPtools-sim1.pdf}
+<<sim2, fig = TRUE>>=
+library(lattice)
+print(levelplot(temp[ord,ord], col.regions = rev(myCol),
+          at = do.breaks(c(0, max(temp)), 128),
+          xlab = "", ylab = "",
+          ## Rotate labels of x-axis
+          scales = list(x = list(rot = 90)),
+          main = "(Dis-)Similarity Plot"))
+@
+\myinfig{1}{RFLPtools-sim2.pdf}
+Some bands may be missing ...
+<<sim3>>=
+## Euclidean distance
+data(RFLPdata)
+data(RFLPref)
+nrBands(RFLPdata)
+res0 <- RFLPdist2(RFLPdata, nrBands = 9, nrMissing = 0)
+res1 <- RFLPdist2(RFLPdata, nrBands = 9, nrMissing = 1)
+res2 <- RFLPdist2(RFLPdata, nrBands = 9, nrMissing = 2)
+res3 <- RFLPdist2(RFLPdata, nrBands = 9, nrMissing = 3)
+@
+<<sim4, fig = TRUE>>=
+## hierarchical clustering
+par(mfrow = c(2,2))
+plot(hclust(res0), main = "0 bands missing")
+plot(hclust(res1), main = "1 band missing")
+plot(hclust(res2), main = "2 bands missing")
+plot(hclust(res3), main = "3 bands missing")
+@
+\myinfig{1}{RFLPtools-sim4.pdf}
+Another possible visualization ...
+<<RFLPplot, fig = TRUE>>=
+par(mfrow = c(1,2))
+plot(hclust(RFLPdist(RFLPdata, nrBands = 3)), cex = 0.7)
+RFLPplot(RFLPdata, nrBands = 3, mar.bottom = 6, cex.axis = 0.8)
+@
+\myinfig{1}{RFLPtools-RFLPplot.pdf}
+Comparison to reference data ...
+<<RFLPrefplot, fig = TRUE>>=
+RFLPrefplot(RFLPdata, RFLPref, nrBands = 9, cex.axis = 0.8)
+@
+\myinfig{1}{RFLPtools-RFLPrefplot.pdf}
+%-------------------------------------------------------------------------------
+\section{BLAST data} 
+%-------------------------------------------------------------------------------
+To analyze tabular report files generated with standalone BLAST from NCBI 
+(cf.\ \url{ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release}), 
+a function for reading the BLAST report files is included (\code{read.blast}).  
+<<read.blast>>=
+Dir <- system.file("extdata", package = "RFLPtools") # input directory 
+filename <- file.path(Dir, "BLASTexample.txt")
+BLAST1 <- read.blast(file = filename)
+str(BLAST1)
+@
+This example BLAST data is also available as loadable example data.
+<<blast>>=
+data(BLASTdata)
+@
+The loaded \code{data.frame} can be used to compute similarities between the 
+BLASTed sequences via function \code{simMatrix}. This function includes the following 
+steps:
+\begin{enumerate}
+\item the length of each sequence ({\tt LS}) comprised in the input data file is extracted.
+\item if there is more than one comparison for one sequence including different parts of 
+the respective sequence, that one with maximum base length is chosen.
+\item the number of matching bases ({\tt mB}) is calculated by multiplying two variables 
+given in the BLAST output: the identity between sequences (\%) and the number of nucleotides 
+divided by 100. 
+\item the resulting value is rounded to the next integer. 
+\item the similarity is calculated by dividing {\tt mB} by {\tt LS} and saved in the
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/rflptools -r 2


More information about the Rflptools-commits mailing list