[CHNOSZ-commits] r304 - in pkg/CHNOSZ: . R inst man src tests/testthat

Sat Feb 17 18:47:07 CET 2018

Author: jedick
Date: 2018-02-17 18:47:07 +0100 (Sat, 17 Feb 2018)
New Revision: 304

Added:
   pkg/CHNOSZ/src/count_letters.c
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/NAMESPACE
   pkg/CHNOSZ/R/util.fasta.R
   pkg/CHNOSZ/R/water.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/man/util.fasta.Rd
   pkg/CHNOSZ/src/init.c
   pkg/CHNOSZ/tests/testthat/test-util.seq.R
Log:
count.aa(): improve performance by using C code (src/count_letters.c)


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/DESCRIPTION	2018-02-17 17:47:07 UTC (rev 304)
@@ -1,6 +1,6 @@
-Date: 2018-01-28
+Date: 2018-02-18
 Package: CHNOSZ
-Version: 1.1.3-11
+Version: 1.1.3-12
 Title: Thermodynamic Calculations for Geobiochemistry
 Authors at R: c(
     person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),

Modified: pkg/CHNOSZ/NAMESPACE
===================================================================
--- pkg/CHNOSZ/NAMESPACE	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/NAMESPACE	2018-02-17 17:47:07 UTC (rev 304)
@@ -1,7 +1,3 @@
-## Export all names except those beginning with F_
-## (so we don't export F_h2o92 created below)
-#exportPattern("^([^F]|F($|[^_])).*")
-
 # exports starting with functions used in vignettes, examples, demos, tests 20170224
 export(
 # anintro vignette
@@ -64,7 +60,8 @@
 )
 
 # Load shared objects
-useDynLib(CHNOSZ, .registration = TRUE, .fixes = "F_")
+# Refer to all C/Fortran routines by their name prefixed by C_
+useDynLib(CHNOSZ, .registration = TRUE, .fixes = "C_")
 
 # Imports from default packages
 importFrom("grDevices", "dev.cur", "dev.off", "extendrange",

Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/R/util.fasta.R	2018-02-17 17:47:07 UTC (rev 304)
@@ -14,28 +14,24 @@
   #   fas: fasta entry
   # value of 'id' is used for 'protein' in output table,
   #   otherwise ID is parsed from FASTA header (can take a while)
-  is.nix <- Sys.info()[[1]]=="Linux"
-  if(is.nix & is.null(lines)) {
-    message("read.fasta: reading ",basename(file))
-    # figure out whether to use 'cat', 'zcat' or 'xzcat'
-    suffix <- substr(file,nchar(file)-2,nchar(file))
-    if(suffix==".gz") mycat <- "zcat"
-    else if(suffix==".xz") mycat <- "xzcat"
-    else mycat <- "cat"
-    nlines <- as.integer(system(paste(mycat,' "',file,'" | wc -l',sep=""),intern=TRUE))
-    ihead <- as.integer(system(paste(mycat,' "',file,'" | grep -n "^>" | cut -f 1 -d ":"',sep=""),intern=TRUE))
-    #linefun <- function(i1,i2) as.character(system(paste('sed -n ',i1,',',i2,'p ',file,sep=""),intern=TRUE))
-    lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
-    linefun <- function(i1,i2) lines[i1:i2]
-  } else {
-    if(is.null(lines)) {
-      lines <- readLines(file)
-      message("read.fasta: reading ",basename(file))
-    }
-    nlines <- length(lines)
-    if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
-    linefun <- function(i1,i2) lines[i1:i2]
+  if(is.null(lines)) {
+    message("read.fasta: reading ", basename(file), " ... ", appendLF=FALSE)
+    #lines <- readLines(file)
+    is.nix <- Sys.info()[[1]]=="Linux"
+    if(is.nix) {
+      # figure out whether to use 'cat', 'zcat' or 'xzcat'
+      suffix <- substr(file,nchar(file)-2,nchar(file))
+      if(suffix==".gz") mycat <- "zcat"
+      else if(suffix==".xz") mycat <- "xzcat"
+      else mycat <- "cat"
+      lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
+    } else lines <- scan(file, what=character(), sep="\n", quiet=TRUE)
   }
+  nlines <- length(lines)
+  message(nlines, " lines ... ", appendLF=FALSE)
+  if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
+  message(length(ihead), " sequence headers")
+  linefun <- function(i1,i2) lines[i1:i2]
   # identify the lines that begin and end each sequence
   begin <- ihead + 1
   end <- ihead - 1
@@ -143,10 +139,13 @@
 
 count.aa <- function(seq, start=NULL, stop=NULL, type="protein") {
   # count amino acids or DNA bases in one or more sequences given as elements of the list seq
-  # put them in alphabetical order (amino acids: same order as in thermo$protein)
   if(type=="protein") letts <- aminoacids(1)
   else if(type=="DNA") letts <- c("A", "C", "G", "T")
   else stop(paste("unknown sequence type", type))
+  # the numerical positions of the letters in alphabetical order (i.e. for amino acids, same order as in thermo$protein)
+  ilett <- match(letts, LETTERS)
+  # the letters A-Z represented by raw values
+  rawAZ <- charToRaw("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
   # to count the letters in each sequence
   countfun <- function(seq, start, stop) {
     # get a substring if one or both of start or stop are given
@@ -157,24 +156,27 @@
     } else if(!is.null(stop)) {
       seq <- substr(seq, 1, stop)
     }
-    # the actual counting
-    nnn <- table(strsplit(toupper(seq), "")[[1]])
-    # get them in alphabetical order
-    ilett <- match(names(nnn), letts)
-    # in case any letters aren't in our alphabet
-    ina <- is.na(ilett)
-    if(any(ina)) message(paste("count.aa: unrecognized letter(s) in", type, "sequence:",
-      paste(names(nnn)[ina], collapse=" ")))
-    count <- numeric(length(letts))
-    count[ilett[!ina]] <- nnn[!ina]
-    return(count)
+    ## the actual counting ...
+    #nnn <- table(strsplit(toupper(seq), "")[[1]])
+    # ... replaced with C version 20180217
+    counts <- .C(C_count_letters, seq, integer(26))[[2]]
+    # which is equivalent to this R code:
+    #rawseq <- charToRaw(toupper(seq))
+    #counts <- sapply(rawAZ, function(x) sum(rawseq == x))
+    return(counts)
   }
   # counts for each sequence
-  a <- palply("", seq, countfun, start, stop)
-  a <- t(as.data.frame(a, optional=TRUE))
+  counts <- palply("", seq, countfun, start, stop)
+  counts <- do.call(rbind, counts)
+  # check for letters that aren't in our alphabet
+  ina <- colSums(counts[, -ilett, drop=FALSE]) > 0
+  if(any(ina)) {
+    message(paste("count.aa: unrecognized letter(s) in", type, "sequence:", paste(LETTERS[-ilett][ina], collapse=" ")))
+  }
+  counts <- counts[, ilett, drop=FALSE]
   # clean up row/column names
-  colnames(a) <- letts
-  rownames(a) <- 1:nrow(a)
-  return(a)
+  colnames(counts) <- letts
+  rownames(counts) <- 1:nrow(counts)
+  return(counts)
 }
 

Modified: pkg/CHNOSZ/R/water.R
===================================================================
--- pkg/CHNOSZ/R/water.R	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/R/water.R	2018-02-17 17:47:07 UTC (rev 304)
@@ -82,7 +82,7 @@
       rho.out[i] <- NA
     } else {
       # now to the actual calculations
-      H2O <- .Fortran(F_h2o92, as.integer(specs), as.double(states),
+      H2O <- .Fortran(C_h2o92, as.integer(specs), as.double(states),
         as.double(rep(0, 46)), as.integer(0))
       # errors
       err.out[i] <- H2O[[4]]

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/inst/NEWS	2018-02-17 17:47:07 UTC (rev 304)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.1.3-11 (2018-01-28)
+CHANGES IN CHNOSZ 1.1.3-12 (2018-02-18)
 ---------------------------------------
 
 - Lines in 1-D diagram()s can optionally be drawn as splines using the
@@ -25,9 +25,12 @@
 - In equilibrate(), accept a length > 1 'normalize' argument in order
   normalize the chemical formulas of only the selected species.
 
-- Export thermo.axis(), as it is useful for adding major&minor tick
+- Export thermo.axis(), as it is useful for adding major and minor tick
   marks after (above) other plot elements such as legends.
 
+- Add C implementation of counting occurrences of all letters in a
+  string (src/count_letters.c) to speed up operation of count.aa().
+
 CHANGES IN CHNOSZ 1.1.3 (2017-11-13)
 ------------------------------------
 

Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/man/util.fasta.Rd	2018-02-17 17:47:07 UTC (rev 304)
@@ -36,7 +36,6 @@
 Use \code{iseq} to select the sequences to read (the default is all sequences).
 The function returns various formats depending on the value of \code{ret}.
 The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
-This function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines.
 If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. 
 Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
 When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
@@ -44,7 +43,7 @@
 
 \code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
 For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
-Letters are matched without regard for case.
+The matching of letters is case-insensitive.
 A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations.
 \code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}).
 If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}).
@@ -52,7 +51,6 @@
 \code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}).
 The \code{protein} argument corresponds to the \samp{Entry name} on the UniProt search pages.
 
-
 }
 
 \value{

Added: pkg/CHNOSZ/src/count_letters.c
===================================================================
--- pkg/CHNOSZ/src/count_letters.c	                        (rev 0)
+++ pkg/CHNOSZ/src/count_letters.c	2018-02-17 17:47:07 UTC (rev 304)
@@ -0,0 +1,20 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* [20180217] adapted from https://stackoverflow.com/questions/13213422/count-the-number-of-occurrences-of-each-letter-in-string */
+
+void count_letters(char **chararray, int *counts) {
+
+  char *str = chararray[0];
+
+  int i;
+  size_t len = strlen(str);
+
+  for (i = 0; i < len; i++) {
+      char c = str[i];
+      if (!isalpha(c)) continue;
+      counts[(int)(tolower(c) - 'a')]++;
+  }
+
+}

Modified: pkg/CHNOSZ/src/init.c
===================================================================
--- pkg/CHNOSZ/src/init.c	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/src/init.c	2018-02-17 17:47:07 UTC (rev 304)
@@ -10,8 +10,21 @@
     {NULL, NULL, 0}
 };
 
+/* .C calls */
+extern void count_letters(void *, void *);
+
+static const R_CMethodDef CEntries[] = {
+    {"count_letters", (DL_FUNC) &count_letters, 2},
+    {NULL, NULL, 0}
+};
+
 void R_init_CHNOSZ(DllInfo *dll)
 {
+    R_registerRoutines(dll, CEntries, NULL, FortranEntries, NULL);
+/*
     R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
+    R_registerRoutines(dll, CEntries, NULL, NULL, NULL);
+*/
     R_useDynamicSymbols(dll, FALSE);
 }
+

Modified: pkg/CHNOSZ/tests/testthat/test-util.seq.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-util.seq.R	2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/tests/testthat/test-util.seq.R	2018-02-17 17:47:07 UTC (rev 304)
@@ -17,3 +17,9 @@
   # ACG -> UGC (RNA complement)
   expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4")
 })
+
+test_that("count.aa() correctly processes a longer nucleobase sequence", {
+  seq <- "ATGTCCCGTTTCTTAGTTGCATTGGTTGCCGCACTTTTAGGAGTTGCAATTGAGATGTCCCTTCTCGTTCGCGCTCAGGGGCAGCAAACCTTGCTTTTGGCTGAAGAAAGCAAGCATTTGTCGCAATTGCGTCAACTGACTTTTGAAGGCACCAATGCCGAAGCGTATTGGTCGCCTGACGGGAAATGGTTGGTCTTTCAATCCACACGCCCACCTTACAAGGCTGACCAAATCTTCATCATGAGAGCGGATGGCTCGGGAGTTCGTGTCGTCAGCACGGGCAAAGGTCGTTGCACTTGTGCCTATTTCACGCCAGATGGCAAAGGCGTTATCTTTGCTACGACCCACCTTGCTGGACCAGAACCGCCGCAAGTGCCCAAACTGGACATTCCACGCTATGTTTGGGGCGTGTTCCCAAGTTACGAACTTTACCTGCGGCGTTTGGACACGATGGAACTTATCCGCTTGACCGATAACGAAGGCTACGACGCTGAAGCGACCATTTGCTGGAAGACTGGGCGAATTGTCTTCACAAGTTACCGCAATGGCGACCTTGACCTTTACAGCATGAAATTAGACGGCAGCGATTTGAAGCGATTGACGAAAACCATCGGCTACGAGGGCGGAGCGTTCTACTCGCCCGACGGGAAGCGGATTGTCTTCCGAGCCTATTTGCCAAAGACGCCTGACGAAATTGACGAATACAAGCGGTTGCTCCAGTTAGGCGTCATAAGCCCACCAAAGATGGAGTGGGTCGTCATGGACGCCGACGGTCGCAACATGAAGCAAATC"
+  counts <- data.frame(A=190, C=203, G=211, T=188)
+  expect_equal(as.numeric(count.aa(seq, type="DNA")), as.numeric(counts))
+})