[CHNOSZ-commits] r304 - in pkg/CHNOSZ: . R inst man src tests/testthat
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sat Feb 17 18:47:07 CET 2018
Author: jedick
Date: 2018-02-17 18:47:07 +0100 (Sat, 17 Feb 2018)
New Revision: 304
Added:
pkg/CHNOSZ/src/count_letters.c
Modified:
pkg/CHNOSZ/DESCRIPTION
pkg/CHNOSZ/NAMESPACE
pkg/CHNOSZ/R/util.fasta.R
pkg/CHNOSZ/R/water.R
pkg/CHNOSZ/inst/NEWS
pkg/CHNOSZ/man/util.fasta.Rd
pkg/CHNOSZ/src/init.c
pkg/CHNOSZ/tests/testthat/test-util.seq.R
Log:
count.aa(): improve performance by using C code (src/count_letters.c)
Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/DESCRIPTION 2018-02-17 17:47:07 UTC (rev 304)
@@ -1,6 +1,6 @@
-Date: 2018-01-28
+Date: 2018-02-18
Package: CHNOSZ
-Version: 1.1.3-11
+Version: 1.1.3-12
Title: Thermodynamic Calculations for Geobiochemistry
Authors at R: c(
person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),
Modified: pkg/CHNOSZ/NAMESPACE
===================================================================
--- pkg/CHNOSZ/NAMESPACE 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/NAMESPACE 2018-02-17 17:47:07 UTC (rev 304)
@@ -1,7 +1,3 @@
-## Export all names except those beginning with F_
-## (so we don't export F_h2o92 created below)
-#exportPattern("^([^F]|F($|[^_])).*")
-
# exports starting with functions used in vignettes, examples, demos, tests 20170224
export(
# anintro vignette
@@ -64,7 +60,8 @@
)
# Load shared objects
-useDynLib(CHNOSZ, .registration = TRUE, .fixes = "F_")
+# Refer to all C/Fortran routines by their name prefixed by C_
+useDynLib(CHNOSZ, .registration = TRUE, .fixes = "C_")
# Imports from default packages
importFrom("grDevices", "dev.cur", "dev.off", "extendrange",
Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/R/util.fasta.R 2018-02-17 17:47:07 UTC (rev 304)
@@ -14,28 +14,24 @@
# fas: fasta entry
# value of 'id' is used for 'protein' in output table,
# otherwise ID is parsed from FASTA header (can take a while)
- is.nix <- Sys.info()[[1]]=="Linux"
- if(is.nix & is.null(lines)) {
- message("read.fasta: reading ",basename(file))
- # figure out whether to use 'cat', 'zcat' or 'xzcat'
- suffix <- substr(file,nchar(file)-2,nchar(file))
- if(suffix==".gz") mycat <- "zcat"
- else if(suffix==".xz") mycat <- "xzcat"
- else mycat <- "cat"
- nlines <- as.integer(system(paste(mycat,' "',file,'" | wc -l',sep=""),intern=TRUE))
- ihead <- as.integer(system(paste(mycat,' "',file,'" | grep -n "^>" | cut -f 1 -d ":"',sep=""),intern=TRUE))
- #linefun <- function(i1,i2) as.character(system(paste('sed -n ',i1,',',i2,'p ',file,sep=""),intern=TRUE))
- lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
- linefun <- function(i1,i2) lines[i1:i2]
- } else {
- if(is.null(lines)) {
- lines <- readLines(file)
- message("read.fasta: reading ",basename(file))
- }
- nlines <- length(lines)
- if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
- linefun <- function(i1,i2) lines[i1:i2]
+ if(is.null(lines)) {
+ message("read.fasta: reading ", basename(file), " ... ", appendLF=FALSE)
+ #lines <- readLines(file)
+ is.nix <- Sys.info()[[1]]=="Linux"
+ if(is.nix) {
+ # figure out whether to use 'cat', 'zcat' or 'xzcat'
+ suffix <- substr(file,nchar(file)-2,nchar(file))
+ if(suffix==".gz") mycat <- "zcat"
+ else if(suffix==".xz") mycat <- "xzcat"
+ else mycat <- "cat"
+ lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
+ } else lines <- scan(file, what=character(), sep="\n", quiet=TRUE)
}
+ nlines <- length(lines)
+ message(nlines, " lines ... ", appendLF=FALSE)
+ if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
+ message(length(ihead), " sequence headers")
+ linefun <- function(i1,i2) lines[i1:i2]
# identify the lines that begin and end each sequence
begin <- ihead + 1
end <- ihead - 1
@@ -143,10 +139,13 @@
count.aa <- function(seq, start=NULL, stop=NULL, type="protein") {
# count amino acids or DNA bases in one or more sequences given as elements of the list seq
- # put them in alphabetical order (amino acids: same order as in thermo$protein)
if(type=="protein") letts <- aminoacids(1)
else if(type=="DNA") letts <- c("A", "C", "G", "T")
else stop(paste("unknown sequence type", type))
+ # the numerical positions of the letters in alphabetical order (i.e. for amino acids, same order as in thermo$protein)
+ ilett <- match(letts, LETTERS)
+ # the letters A-Z represented by raw values
+ rawAZ <- charToRaw("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
# to count the letters in each sequence
countfun <- function(seq, start, stop) {
# get a substring if one or both of start or stop are given
@@ -157,24 +156,27 @@
} else if(!is.null(stop)) {
seq <- substr(seq, 1, stop)
}
- # the actual counting
- nnn <- table(strsplit(toupper(seq), "")[[1]])
- # get them in alphabetical order
- ilett <- match(names(nnn), letts)
- # in case any letters aren't in our alphabet
- ina <- is.na(ilett)
- if(any(ina)) message(paste("count.aa: unrecognized letter(s) in", type, "sequence:",
- paste(names(nnn)[ina], collapse=" ")))
- count <- numeric(length(letts))
- count[ilett[!ina]] <- nnn[!ina]
- return(count)
+ ## the actual counting ...
+ #nnn <- table(strsplit(toupper(seq), "")[[1]])
+ # ... replaced with C version 20180217
+ counts <- .C(C_count_letters, seq, integer(26))[[2]]
+ # which is equivalent to this R code:
+ #rawseq <- charToRaw(toupper(seq))
+ #counts <- sapply(rawAZ, function(x) sum(rawseq == x))
+ return(counts)
}
# counts for each sequence
- a <- palply("", seq, countfun, start, stop)
- a <- t(as.data.frame(a, optional=TRUE))
+ counts <- palply("", seq, countfun, start, stop)
+ counts <- do.call(rbind, counts)
+ # check for letters that aren't in our alphabet
+ ina <- colSums(counts[, -ilett, drop=FALSE]) > 0
+ if(any(ina)) {
+ message(paste("count.aa: unrecognized letter(s) in", type, "sequence:", paste(LETTERS[-ilett][ina], collapse=" ")))
+ }
+ counts <- counts[, ilett, drop=FALSE]
# clean up row/column names
- colnames(a) <- letts
- rownames(a) <- 1:nrow(a)
- return(a)
+ colnames(counts) <- letts
+ rownames(counts) <- 1:nrow(counts)
+ return(counts)
}
Modified: pkg/CHNOSZ/R/water.R
===================================================================
--- pkg/CHNOSZ/R/water.R 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/R/water.R 2018-02-17 17:47:07 UTC (rev 304)
@@ -82,7 +82,7 @@
rho.out[i] <- NA
} else {
# now to the actual calculations
- H2O <- .Fortran(F_h2o92, as.integer(specs), as.double(states),
+ H2O <- .Fortran(C_h2o92, as.integer(specs), as.double(states),
as.double(rep(0, 46)), as.integer(0))
# errors
err.out[i] <- H2O[[4]]
Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/inst/NEWS 2018-02-17 17:47:07 UTC (rev 304)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.1.3-11 (2018-01-28)
+CHANGES IN CHNOSZ 1.1.3-12 (2018-02-18)
---------------------------------------
- Lines in 1-D diagram()s can optionally be drawn as splines using the
@@ -25,9 +25,12 @@
- In equilibrate(), accept a length > 1 'normalize' argument in order
normalize the chemical formulas of only the selected species.
-- Export thermo.axis(), as it is useful for adding major&minor tick
+- Export thermo.axis(), as it is useful for adding major and minor tick
marks after (above) other plot elements such as legends.
+- Add C implementation of counting occurrences of all letters in a
+ string (src/count_letters.c) to speed up operation of count.aa().
+
CHANGES IN CHNOSZ 1.1.3 (2017-11-13)
------------------------------------
Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/man/util.fasta.Rd 2018-02-17 17:47:07 UTC (rev 304)
@@ -36,7 +36,6 @@
Use \code{iseq} to select the sequences to read (the default is all sequences).
The function returns various formats depending on the value of \code{ret}.
The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
-This function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines.
If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}.
Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
@@ -44,7 +43,7 @@
\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
-Letters are matched without regard for case.
+The matching of letters is case-insensitive.
A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations.
\code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}).
If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}).
@@ -52,7 +51,6 @@
\code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}).
The \code{protein} argument corresponds to the \samp{Entry name} on the UniProt search pages.
-
}
\value{
Added: pkg/CHNOSZ/src/count_letters.c
===================================================================
--- pkg/CHNOSZ/src/count_letters.c (rev 0)
+++ pkg/CHNOSZ/src/count_letters.c 2018-02-17 17:47:07 UTC (rev 304)
@@ -0,0 +1,20 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* [20180217] adapted from https://stackoverflow.com/questions/13213422/count-the-number-of-occurrences-of-each-letter-in-string */
+
+void count_letters(char **chararray, int *counts) {
+
+ char *str = chararray[0];
+
+ int i;
+ size_t len = strlen(str);
+
+ for (i = 0; i < len; i++) {
+ char c = str[i];
+ if (!isalpha(c)) continue;
+ counts[(int)(tolower(c) - 'a')]++;
+ }
+
+}
Modified: pkg/CHNOSZ/src/init.c
===================================================================
--- pkg/CHNOSZ/src/init.c 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/src/init.c 2018-02-17 17:47:07 UTC (rev 304)
@@ -10,8 +10,21 @@
{NULL, NULL, 0}
};
+/* .C calls */
+extern void count_letters(void *, void *);
+
+static const R_CMethodDef CEntries[] = {
+ {"count_letters", (DL_FUNC) &count_letters, 2},
+ {NULL, NULL, 0}
+};
+
void R_init_CHNOSZ(DllInfo *dll)
{
+ R_registerRoutines(dll, CEntries, NULL, FortranEntries, NULL);
+/*
R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
+ R_registerRoutines(dll, CEntries, NULL, NULL, NULL);
+*/
R_useDynamicSymbols(dll, FALSE);
}
+
Modified: pkg/CHNOSZ/tests/testthat/test-util.seq.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-util.seq.R 2018-01-28 08:22:34 UTC (rev 303)
+++ pkg/CHNOSZ/tests/testthat/test-util.seq.R 2018-02-17 17:47:07 UTC (rev 304)
@@ -17,3 +17,9 @@
# ACG -> UGC (RNA complement)
expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4")
})
+
+test_that("count.aa() correctly processes a longer nucleobase sequence", {
+ seq <- "ATGTCCCGTTTCTTAGTTGCATTGGTTGCCGCACTTTTAGGAGTTGCAATTGAGATGTCCCTTCTCGTTCGCGCTCAGGGGCAGCAAACCTTGCTTTTGGCTGAAGAAAGCAAGCATTTGTCGCAATTGCGTCAACTGACTTTTGAAGGCACCAATGCCGAAGCGTATTGGTCGCCTGACGGGAAATGGTTGGTCTTTCAATCCACACGCCCACCTTACAAGGCTGACCAAATCTTCATCATGAGAGCGGATGGCTCGGGAGTTCGTGTCGTCAGCACGGGCAAAGGTCGTTGCACTTGTGCCTATTTCACGCCAGATGGCAAAGGCGTTATCTTTGCTACGACCCACCTTGCTGGACCAGAACCGCCGCAAGTGCCCAAACTGGACATTCCACGCTATGTTTGGGGCGTGTTCCCAAGTTACGAACTTTACCTGCGGCGTTTGGACACGATGGAACTTATCCGCTTGACCGATAACGAAGGCTACGACGCTGAAGCGACCATTTGCTGGAAGACTGGGCGAATTGTCTTCACAAGTTACCGCAATGGCGACCTTGACCTTTACAGCATGAAATTAGACGGCAGCGATTTGAAGCGATTGACGAAAACCATCGGCTACGAGGGCGGAGCGTTCTACTCGCCCGACGGGAAGCGGATTGTCTTCCGAGCCTATTTGCCAAAGACGCCTGACGAAATTGACGAATACAAGCGGTTGCTCCAGTTAGGCGTCATAAGCCCACCAAAGATGGAGTGGGTCGTCATGGACGCCGACGGTCGCAACATGAAGCAAATC"
+ counts <- data.frame(A=190, C=203, G=211, T=188)
+ expect_equal(as.numeric(count.aa(seq, type="DNA")), as.numeric(counts))
+})
More information about the CHNOSZ-commits
mailing list