[Seqinr-commits] r2107 - in pkg: . R man

Wed Dec 16 11:12:14 CET 2020

Author: simonpenel
Date: 2020-12-16 11:12:14 +0100 (Wed, 16 Dec 2020)
New Revision: 2107

Modified:
   pkg/DESCRIPTION
   pkg/R/GC.R
   pkg/R/read.alignment.R
   pkg/man/GC.Rd
Log:
Modify GC.R and read.alignment.R according to Dr. Haruo Suzuki suggestions

Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/DESCRIPTION	2020-12-16 10:12:14 UTC (rev 2107)
@@ -1,7 +1,7 @@
 Encoding: UTF-8
 Package: seqinr
-Version: 4.2-4
-Date: 2020-10-8
+Version: 4.2-5
+Date: 2020-12-16
 Title: Biological Sequences Retrieval and Analysis
 Authors at R: c(person("Delphine", "Charif", role = "aut"),
              person("Olivier", "Clerc", role = "ctb"),
@@ -17,7 +17,7 @@
 Imports: ade4,segmented
 Depends: R (>= 2.10.0)
 Description: Exploratory data analysis and data visualization
-  for biological sequence (DNA and protein) data. Seqinr includes 
+  for biological sequence (DNA and protein) data. Seqinr includes
   utilities for sequence data management under the ACNUC system
   described in Gouy, M. et al. (1984) Nucleic Acids Res.
   12:121-127 <doi:10.1093/nar/12.1Part1.121>.

Modified: pkg/R/GC.R
===================================================================
--- pkg/R/GC.R	2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/R/GC.R	2020-12-16 10:12:14 UTC (rev 2107)
@@ -2,8 +2,11 @@
 # G+C content
 #################################
 
-GC <- function(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE )
+GC <- function(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE,  alphabet = s2c("acgtswmkryvhdb"))
 {
+
+
+
   #
   # NA propagation:
   #
@@ -16,14 +19,17 @@
   # Force to lower-case letters if requested:
   #
   if(forceToLower) seq <- tolower(seq)
+
+  seq <- seq[seq %in% alphabet]
+
   #
   # Compute the count of each base:
   #
   nc <- sum( seq == "c" )
-  ng <- sum( seq == "g" )			
+  ng <- sum( seq == "g" )
   na <- sum( seq == "a" )
   nt <- sum( seq == "t" )
-  
+
   #
   # oldGC case:
   #
@@ -31,7 +37,7 @@
     warning("argument oldGC is deprecated")
     return( (nc + ng)/length(seq) )
   }
-  
+
   #
   # General case:
   #
@@ -61,7 +67,7 @@
 		##########################
 		# Ambiguous base section #
 		##########################
-		
+
 		#
 		# m : Amino (a or c)
 		#
@@ -70,7 +76,7 @@
 			ngc <- ngc + nm*nc/(na + nc)
 		nat <- nat + nm*na/(na + nc)
 		}
-               
+
 		#
 		# k : Keto (g or t)
 		#
@@ -79,7 +85,7 @@
 			ngc <- ngc + nk*ng/(ng + nt)
 			nat <- nat + nk*nt/(ng + nt)
 		}
-		
+
 		#
 		# r : Purine (a or g)
 		#
@@ -87,8 +93,8 @@
 			nr <- sum( seq == "r" )
 			ngc <- ngc + nr*ng/(ng + na)
 			nat <- nat + nr*na/(ng + na)
-                      }      
-               
+                      }
+
 		#
 		# y : Pyrimidine (c or t)
 		#
@@ -97,7 +103,7 @@
 			ngc <- ngc + ny*nc/(nc + nt)
 			nat <- nat + ny*nt/(nc + nt)
 		}
-                
+
 		#
 		# v : not t (a, c or g)
 		#
@@ -179,4 +185,3 @@
 ######################
 
 GC3 <- function(seq, frame = 0, ...) GCpos(seq = seq, pos = 3, frame = frame, ...)
-

Modified: pkg/R/read.alignment.R
===================================================================
--- pkg/R/read.alignment.R	2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/R/read.alignment.R	2020-12-16 10:12:14 UTC (rev 2107)
@@ -4,17 +4,29 @@
 read.alignment <- function(file, format, forceToLower = TRUE, ...)
 {
   #
+  # Check if the file is an URL:
+  #
+  urlcheck = substr(file, 0,4)
+  if (urlcheck == "http") {
+    # Copy the contents to a tmpfile to be read later
+    lines <- readLines(file)
+    tmpaln <- tempfile(pattern = "readaln")
+    writeLines(lines,tmpaln)
+    file <-tmpaln
+  }
+
+  #
   # Check that we have read permission on the file:
-  # 
-  file <- path.expand(file) 
+  #
+  file <- path.expand(file)
   if(file.access(file, mode = 4) != 0) stop(paste("File", file, "is not readable"))
-  
+
   fasta2ali <- function(file, ...){
   	tmp <- read.fasta(file, as.string = TRUE, ...)
   	list(length(tmp), getName(tmp), unlist(getSequence(tmp, as.string = TRUE)))
   }
   ali <- switch( tolower(format),
-	fasta = fasta2ali(file, ...), 
+	fasta = fasta2ali(file, ...),
 	mase = .Call("read_mase", file, PACKAGE = "seqinr"),
 	phylip = .Call("read_phylip_align", file, PACKAGE = "seqinr"),
 	msf = .Call("read_msf_align", file, PACKAGE = "seqinr"),
@@ -27,7 +39,7 @@
   ali <- lapply(ali, function (x ){gsub ('\r','',x)})
   if(forceToLower) ali[[3]] <- lapply(ali[[3]], tolower)
   if(format == "mase"){
-    ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = ali[[4]]) 
+    ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = ali[[4]])
   } else {
     ali <- list(nb = as.numeric(ali[[1]]), nam = ali[[2]], seq = ali[[3]], com = NA)
   }

Modified: pkg/man/GC.Rd
===================================================================
--- pkg/man/GC.Rd	2020-10-09 13:13:13 UTC (rev 2106)
+++ pkg/man/GC.Rd	2020-12-16 10:12:14 UTC (rev 2107)
@@ -9,7 +9,7 @@
   Calculates the fraction of G+C bases of the input nucleic acid
   sequence(s). It reads in nucleic acid sequences, sums the number of
   'g' and 'c' bases and writes out the result as the fraction (in the
-  interval 0.0 to 1.0) to the total number of 'a', 'c', 'g' and 't' bases. 
+  interval 0.0 to 1.0) to the total number of 'a', 'c', 'g' and 't' bases.
   Global G+C content \code{GC}, G+C in the first position of the codon bases
   \code{GC1}, G+C in the second position of the codon bases
   \code{GC2}, and G+C in the third position of the codon bases
@@ -17,7 +17,7 @@
   into account when requested.
 }
 \usage{
-GC(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE)
+GC(seq, forceToLower = TRUE, exact = FALSE, NA.GC = NA, oldGC = FALSE,  alphabet = s2c("acgtswmkryvhdb"))
 GC1(seq, frame = 0, ...)
 GC2(seq, frame = 0, ...)
 GC3(seq, frame = 0, ...)
@@ -48,9 +48,11 @@
     as in seqinR <= 1.0-6, that is as the sum of 'g' and 'c' bases divided by
     the length of the sequence. As from seqinR >= 1.1-3, this argument is
     deprecated and a warning is issued.}
+  \item{alphabet}{alphabet used. This allows you to choose ambiguous bases used
+      during GC calculation.}
 }
 \details{
-	When \code{exact} is set to \code{TRUE} the G+C content is estimated 
+	When \code{exact} is set to \code{TRUE} the G+C content is estimated
 	with ambiguous bases taken into account. Note that this is time expensive.
 	A first pass is made on non-ambiguous bases to estimate the probabilities
 	of the four bases in the sequence. They are then used to weight the
@@ -59,13 +61,13 @@
 	suppose that there are nb bases 'b'. 'b' stands for "not a", that
 	is for 'c', 'g' or 't'. The contribution of 'b' bases to the GC base
 	count will be:
-	
+
 	nb*(nc + ng)/(nc + ng + nt)
-	
+
 	The contribution of 'b' bases to the AT base count will be:
-	
+
 	nb*nt/(nc + ng + nt)
-	
+
 	All ambiguous bases contributions to the AT and GC counts are weighted
 	is similar way and then the G+C content is computed as ngc/(nat + ngc).
 }
@@ -75,13 +77,13 @@
   \code{GC1}, \code{GC2}, \code{GC3} are wrappers for \code{GCpos} with the
     argument \code{pos} set to 1, 2, and 3, respectively.
   \code{NA} is returned when \code{seq} is \code{NA}.
-  \code{NA.GC} defaulting to \code{NA} is returned when the G+C content 
+  \code{NA.GC} defaulting to \code{NA} is returned when the G+C content
     can not be computed from data.
 }
 \references{
   \code{citation("seqinr")}.
-  
-  The program codonW used here for comparison is available at 
+
+  The program codonW used here for comparison is available at
   \url{http://codonw.sourceforge.net/}.
 }
 \seealso{You can use \code{\link{s2c}} to convert a string into a vetor of single
@@ -157,7 +159,7 @@
 res[, "size"] <- size
 
 for(i in seq_len(n)){
-  myseq <- sample(x = s2c("acgtws"), size = size[i], replace = TRUE)   
+  myseq <- sample(x = s2c("acgtws"), size = size[i], replace = TRUE)
   res[i, "FF"] <- system.time(GC(myseq, forceToLower = FALSE, exact = FALSE))[3]
   res[i, "FT"] <- system.time(GC(myseq, forceToLower = FALSE, exact = TRUE))[3]
   	res[i, "TF"] <- system.time(GC(myseq, forceToLower = TRUE, exact = FALSE))[3]
@@ -165,7 +167,7 @@
 }
 
 par(oma = c(0,0,2.5,0), mar = c(4,5,0,2) + 0.1, mfrow = c(2, 1))
-plot(res$size, res$TT, las = 1, 
+plot(res$size, res$TT, las = 1,
 xlab = "Sequence size [bp]",
 ylim = c(0, max(res$TT)), xlim = c(0, max(res$size)), ylab = "")
 title(ylab = "Observed time [s]", line = 4)
@@ -187,7 +189,7 @@
 mincpu <- lm(res$FF~res$size)$coef[2]
 
 barplot(
-c(lm(res$FF~res$size)$coef[2]/mincpu, 
+c(lm(res$FF~res$size)$coef[2]/mincpu,
   lm(res$TF~res$size)$coef[2]/mincpu,
   lm(res$FT~res$size)$coef[2]/mincpu,
   lm(res$TT~res$size)$coef[2]/mincpu),