[Seqinr-commits] r2107 - in pkg: . R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Dec 16 11:12:14 CET 2020
Author: simonpenel
Date: 2020-12-16 11:12:14 +0100 (Wed, 16 Dec 2020)
New Revision: 2107
Modified:
pkg/DESCRIPTION
pkg/R/GC.R
pkg/R/read.alignment.R
pkg/man/GC.Rd
Log:
Modify GC.R and read.alignment.R according to Dr. Haruo Suzuki suggestions
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/DESCRIPTION 2020-12-16 10:12:14 UTC (rev 2107)
@@ -1,7 +1,7 @@
Encoding: UTF-8
Package: seqinr
-Version: 4.2-4
-Date: 2020-10-8
+Version: 4.2-5
+Date: 2020-12-16
Title: Biological Sequences Retrieval and Analysis
Authors at R: c(person("Delphine", "Charif", role = "aut"),
person("Olivier", "Clerc", role = "ctb"),
@@ -17,7 +17,7 @@
Imports: ade4,segmented
Depends: R (>= 2.10.0)
Description: Exploratory data analysis and data visualization
- for biological sequence (DNA and protein) data. Seqinr includes
+ for biological sequence (DNA and protein) data. Seqinr includes
utilities for sequence data management under the ACNUC system
described in Gouy, M. et al. (1984) Nucleic Acids Res.
12:121-127 <doi:10.1093/nar/12.1Part1.121>.
Modified: pkg/R/GC.R
===================================================================
--- pkg/R/GC.R 2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/R/GC.R 2020-12-16 10:12:14 UTC (rev 2107)
@@ -2,8 +2,11 @@
# G+C content
#################################
-GC <- function(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE )
+GC <- function(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE, alphabet = s2c("acgtswmkryvhdb"))
{
+
+
+
#
# NA propagation:
#
@@ -16,14 +19,17 @@
# Force to lower-case letters if requested:
#
if(forceToLower) seq <- tolower(seq)
+
+ seq <- seq[seq %in% alphabet]
+
#
# Compute the count of each base:
#
nc <- sum( seq == "c" )
- ng <- sum( seq == "g" )
+ ng <- sum( seq == "g" )
na <- sum( seq == "a" )
nt <- sum( seq == "t" )
-
+
#
# oldGC case:
#
@@ -31,7 +37,7 @@
warning("argument oldGC is deprecated")
return( (nc + ng)/length(seq) )
}
-
+
#
# General case:
#
@@ -61,7 +67,7 @@
##########################
# Ambiguous base section #
##########################
-
+
#
# m : Amino (a or c)
#
@@ -70,7 +76,7 @@
ngc <- ngc + nm*nc/(na + nc)
nat <- nat + nm*na/(na + nc)
}
-
+
#
# k : Keto (g or t)
#
@@ -79,7 +85,7 @@
ngc <- ngc + nk*ng/(ng + nt)
nat <- nat + nk*nt/(ng + nt)
}
-
+
#
# r : Purine (a or g)
#
@@ -87,8 +93,8 @@
nr <- sum( seq == "r" )
ngc <- ngc + nr*ng/(ng + na)
nat <- nat + nr*na/(ng + na)
- }
-
+ }
+
#
# y : Pyrimidine (c or t)
#
@@ -97,7 +103,7 @@
ngc <- ngc + ny*nc/(nc + nt)
nat <- nat + ny*nt/(nc + nt)
}
-
+
#
# v : not t (a, c or g)
#
@@ -179,4 +185,3 @@
######################
GC3 <- function(seq, frame = 0, ...) GCpos(seq = seq, pos = 3, frame = frame, ...)
-
Modified: pkg/R/read.alignment.R
===================================================================
--- pkg/R/read.alignment.R 2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/R/read.alignment.R 2020-12-16 10:12:14 UTC (rev 2107)
@@ -4,17 +4,29 @@
read.alignment <- function(file, format, forceToLower = TRUE, ...)
{
#
+ # Check if the file is an URL:
+ #
+ urlcheck = substr(file, 0,4)
+ if (urlcheck == "http") {
+ # Copy the contents to a tmpfile to be read later
+ lines <- readLines(file)
+ tmpaln <- tempfile(pattern = "readaln")
+ writeLines(lines,tmpaln)
+ file <-tmpaln
+ }
+
+ #
# Check that we have read permission on the file:
- #
- file <- path.expand(file)
+ #
+ file <- path.expand(file)
if(file.access(file, mode = 4) != 0) stop(paste("File", file, "is not readable"))
-
+
fasta2ali <- function(file, ...){
tmp <- read.fasta(file, as.string = TRUE, ...)
list(length(tmp), getName(tmp), unlist(getSequence(tmp, as.string = TRUE)))
}
ali <- switch( tolower(format),
- fasta = fasta2ali(file, ...),
+ fasta = fasta2ali(file, ...),
mase = .Call("read_mase", file, PACKAGE = "seqinr"),
phylip = .Call("read_phylip_align", file, PACKAGE = "seqinr"),
msf = .Call("read_msf_align", file, PACKAGE = "seqinr"),
@@ -27,7 +39,7 @@
ali <- lapply(ali, function (x ){gsub ('\r','',x)})
if(forceToLower) ali[[3]] <- lapply(ali[[3]], tolower)
if(format == "mase"){
- ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = ali[[4]])
+ ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = ali[[4]])
} else {
ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = NA)
}
Modified: pkg/man/GC.Rd
===================================================================
--- pkg/man/GC.Rd 2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/man/GC.Rd 2020-12-16 10:12:14 UTC (rev 2107)
@@ -9,7 +9,7 @@
Calculates the fraction of G+C bases of the input nucleic acid
sequence(s). It reads in nucleic acid sequences, sums the number of
'g' and 'c' bases and writes out the result as the fraction (in the
- interval 0.0 to 1.0) to the total number of 'a', 'c', 'g' and 't' bases.
+ interval 0.0 to 1.0) to the total number of 'a', 'c', 'g' and 't' bases.
Global G+C content \code{GC}, G+C in the first position of the codon bases
\code{GC1}, G+C in the second position of the codon bases
\code{GC2}, and G+C in the third position of the codon bases
@@ -17,7 +17,7 @@
into account when requested.
}
\usage{
-GC(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE)
+GC(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE, alphabet = s2c("acgtswmkryvhdb"))
GC1(seq, frame = 0, ...)
GC2(seq, frame = 0, ...)
GC3(seq, frame = 0, ...)
@@ -48,9 +48,11 @@
as in seqinR <= 1.0-6, that is as the sum of 'g' and 'c' bases divided by
the length of the sequence. As from seqinR >= 1.1-3, this argument is
deprecated and a warning is issued.}
+ \item{alphabet}{alphabet used. This allows you to choose ambiguous bases used
+ during GC calculation.}
}
\details{
- When \code{exact} is set to \code{TRUE} the G+C content is estimated
+ When \code{exact} is set to \code{TRUE} the G+C content is estimated
with ambiguous bases taken into account. Note that this is time expensive.
A first pass is made on non-ambiguous bases to estimate the probabilities
of the four bases in the sequence. They are then used to weight the
@@ -59,13 +61,13 @@
suppose that there are nb bases 'b'. 'b' stands for "not a", that
is for 'c', 'g' or 't'. The contribution of 'b' bases to the GC base
count will be:
-
+
nb*(nc + ng)/(nc + ng + nt)
-
+
The contribution of 'b' bases to the AT base count will be:
-
+
nb*nt/(nc + ng + nt)
-
+
All ambiguous bases contributions to the AT and GC counts are weighted
is similar way and then the G+C content is computed as ngc/(nat + ngc).
}
@@ -75,13 +77,13 @@
\code{GC1}, \code{GC2}, \code{GC3} are wrappers for \code{GCpos} with the
argument \code{pos} set to 1, 2, and 3, respectively.
\code{NA} is returned when \code{seq} is \code{NA}.
- \code{NA.GC} defaulting to \code{NA} is returned when the G+C content
+ \code{NA.GC} defaulting to \code{NA} is returned when the G+C content
can not be computed from data.
}
\references{
\code{citation("seqinr")}.
-
- The program codonW used here for comparison is available at
+
+ The program codonW used here for comparison is available at
\url{http://codonw.sourceforge.net/}.
}
\seealso{You can use \code{\link{s2c}} to convert a string into a vetor of single
@@ -157,7 +159,7 @@
res[, "size"] <- size
for(i in seq_len(n)){
- myseq <- sample(x = s2c("acgtws"), size = size[i], replace = TRUE)
+ myseq <- sample(x = s2c("acgtws"), size = size[i], replace = TRUE)
res[i, "FF"] <- system.time(GC(myseq, forceToLower = FALSE, exact = FALSE))[3]
res[i, "FT"] <- system.time(GC(myseq, forceToLower = FALSE, exact = TRUE))[3]
res[i, "TF"] <- system.time(GC(myseq, forceToLower = TRUE, exact = FALSE))[3]
@@ -165,7 +167,7 @@
}
par(oma = c(0,0,2.5,0), mar = c(4,5,0,2) + 0.1, mfrow = c(2, 1))
-plot(res$size, res$TT, las = 1,
+plot(res$size, res$TT, las = 1,
xlab = "Sequence size [bp]",
ylim = c(0, max(res$TT)), xlim = c(0, max(res$size)), ylab = "")
title(ylab = "Observed time [s]", line = 4)
@@ -187,7 +189,7 @@
mincpu <- lm(res$FF~res$size)$coef[2]
barplot(
-c(lm(res$FF~res$size)$coef[2]/mincpu,
+c(lm(res$FF~res$size)$coef[2]/mincpu,
lm(res$TF~res$size)$coef[2]/mincpu,
lm(res$FT~res$size)$coef[2]/mincpu,
lm(res$TT~res$size)$coef[2]/mincpu),
More information about the Seqinr-commits
mailing list