From noreply at r-forge.r-project.org Sat Feb 17 18:47:07 2018 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 17 Feb 2018 18:47:07 +0100 (CET) Subject: [CHNOSZ-commits] r304 - in pkg/CHNOSZ: . R inst man src tests/testthat Message-ID: <20180217174707.C7AA7189C93@r-forge.r-project.org> Author: jedick Date: 2018-02-17 18:47:07 +0100 (Sat, 17 Feb 2018) New Revision: 304 Added: pkg/CHNOSZ/src/count_letters.c Modified: pkg/CHNOSZ/DESCRIPTION pkg/CHNOSZ/NAMESPACE pkg/CHNOSZ/R/util.fasta.R pkg/CHNOSZ/R/water.R pkg/CHNOSZ/inst/NEWS pkg/CHNOSZ/man/util.fasta.Rd pkg/CHNOSZ/src/init.c pkg/CHNOSZ/tests/testthat/test-util.seq.R Log: count.aa(): improve performance by using C code (src/count_letters.c) Modified: pkg/CHNOSZ/DESCRIPTION =================================================================== --- pkg/CHNOSZ/DESCRIPTION 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/DESCRIPTION 2018-02-17 17:47:07 UTC (rev 304) @@ -1,6 +1,6 @@ -Date: 2018-01-28 +Date: 2018-02-18 Package: CHNOSZ -Version: 1.1.3-11 +Version: 1.1.3-12 Title: Thermodynamic Calculations for Geobiochemistry Authors at R: c( person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"), Modified: pkg/CHNOSZ/NAMESPACE =================================================================== --- pkg/CHNOSZ/NAMESPACE 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/NAMESPACE 2018-02-17 17:47:07 UTC (rev 304) @@ -1,7 +1,3 @@ -## Export all names except those beginning with F_ -## (so we don't export F_h2o92 created below) -#exportPattern("^([^F]|F($|[^_])).*") - # exports starting with functions used in vignettes, examples, demos, tests 20170224 export( # anintro vignette @@ -64,7 +60,8 @@ ) # Load shared objects -useDynLib(CHNOSZ, .registration = TRUE, .fixes = "F_") +# Refer to all C/Fortran routines by their name prefixed by C_ +useDynLib(CHNOSZ, .registration = TRUE, .fixes = "C_") # Imports from default packages importFrom("grDevices", "dev.cur", "dev.off", "extendrange", Modified: pkg/CHNOSZ/R/util.fasta.R =================================================================== --- pkg/CHNOSZ/R/util.fasta.R 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/R/util.fasta.R 2018-02-17 17:47:07 UTC (rev 304) @@ -14,28 +14,24 @@ # fas: fasta entry # value of 'id' is used for 'protein' in output table, # otherwise ID is parsed from FASTA header (can take a while) - is.nix <- Sys.info()[[1]]=="Linux" - if(is.nix & is.null(lines)) { - message("read.fasta: reading ",basename(file)) - # figure out whether to use 'cat', 'zcat' or 'xzcat' - suffix <- substr(file,nchar(file)-2,nchar(file)) - if(suffix==".gz") mycat <- "zcat" - else if(suffix==".xz") mycat <- "xzcat" - else mycat <- "cat" - nlines <- as.integer(system(paste(mycat,' "',file,'" | wc -l',sep=""),intern=TRUE)) - ihead <- as.integer(system(paste(mycat,' "',file,'" | grep -n "^>" | cut -f 1 -d ":"',sep=""),intern=TRUE)) - #linefun <- function(i1,i2) as.character(system(paste('sed -n ',i1,',',i2,'p ',file,sep=""),intern=TRUE)) - lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE) - linefun <- function(i1,i2) lines[i1:i2] - } else { - if(is.null(lines)) { - lines <- readLines(file) - message("read.fasta: reading ",basename(file)) - } - nlines <- length(lines) - if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">") - linefun <- function(i1,i2) lines[i1:i2] + if(is.null(lines)) { + message("read.fasta: reading ", basename(file), " ... ", appendLF=FALSE) + #lines <- readLines(file) + is.nix <- Sys.info()[[1]]=="Linux" + if(is.nix) { + # figure out whether to use 'cat', 'zcat' or 'xzcat' + suffix <- substr(file,nchar(file)-2,nchar(file)) + if(suffix==".gz") mycat <- "zcat" + else if(suffix==".xz") mycat <- "xzcat" + else mycat <- "cat" + lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE) + } else lines <- scan(file, what=character(), sep="\n", quiet=TRUE) } + nlines <- length(lines) + message(nlines, " lines ... ", appendLF=FALSE) + if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">") + message(length(ihead), " sequence headers") + linefun <- function(i1,i2) lines[i1:i2] # identify the lines that begin and end each sequence begin <- ihead + 1 end <- ihead - 1 @@ -143,10 +139,13 @@ count.aa <- function(seq, start=NULL, stop=NULL, type="protein") { # count amino acids or DNA bases in one or more sequences given as elements of the list seq - # put them in alphabetical order (amino acids: same order as in thermo$protein) if(type=="protein") letts <- aminoacids(1) else if(type=="DNA") letts <- c("A", "C", "G", "T") else stop(paste("unknown sequence type", type)) + # the numerical positions of the letters in alphabetical order (i.e. for amino acids, same order as in thermo$protein) + ilett <- match(letts, LETTERS) + # the letters A-Z represented by raw values + rawAZ <- charToRaw("ABCDEFGHIJKLMNOPQRSTUVWXYZ") # to count the letters in each sequence countfun <- function(seq, start, stop) { # get a substring if one or both of start or stop are given @@ -157,24 +156,27 @@ } else if(!is.null(stop)) { seq <- substr(seq, 1, stop) } - # the actual counting - nnn <- table(strsplit(toupper(seq), "")[[1]]) - # get them in alphabetical order - ilett <- match(names(nnn), letts) - # in case any letters aren't in our alphabet - ina <- is.na(ilett) - if(any(ina)) message(paste("count.aa: unrecognized letter(s) in", type, "sequence:", - paste(names(nnn)[ina], collapse=" "))) - count <- numeric(length(letts)) - count[ilett[!ina]] <- nnn[!ina] - return(count) + ## the actual counting ... + #nnn <- table(strsplit(toupper(seq), "")[[1]]) + # ... replaced with C version 20180217 + counts <- .C(C_count_letters, seq, integer(26))[[2]] + # which is equivalent to this R code: + #rawseq <- charToRaw(toupper(seq)) + #counts <- sapply(rawAZ, function(x) sum(rawseq == x)) + return(counts) } # counts for each sequence - a <- palply("", seq, countfun, start, stop) - a <- t(as.data.frame(a, optional=TRUE)) + counts <- palply("", seq, countfun, start, stop) + counts <- do.call(rbind, counts) + # check for letters that aren't in our alphabet + ina <- colSums(counts[, -ilett, drop=FALSE]) > 0 + if(any(ina)) { + message(paste("count.aa: unrecognized letter(s) in", type, "sequence:", paste(LETTERS[-ilett][ina], collapse=" "))) + } + counts <- counts[, ilett, drop=FALSE] # clean up row/column names - colnames(a) <- letts - rownames(a) <- 1:nrow(a) - return(a) + colnames(counts) <- letts + rownames(counts) <- 1:nrow(counts) + return(counts) } Modified: pkg/CHNOSZ/R/water.R =================================================================== --- pkg/CHNOSZ/R/water.R 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/R/water.R 2018-02-17 17:47:07 UTC (rev 304) @@ -82,7 +82,7 @@ rho.out[i] <- NA } else { # now to the actual calculations - H2O <- .Fortran(F_h2o92, as.integer(specs), as.double(states), + H2O <- .Fortran(C_h2o92, as.integer(specs), as.double(states), as.double(rep(0, 46)), as.integer(0)) # errors err.out[i] <- H2O[[4]] Modified: pkg/CHNOSZ/inst/NEWS =================================================================== --- pkg/CHNOSZ/inst/NEWS 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/inst/NEWS 2018-02-17 17:47:07 UTC (rev 304) @@ -1,4 +1,4 @@ -CHANGES IN CHNOSZ 1.1.3-11 (2018-01-28) +CHANGES IN CHNOSZ 1.1.3-12 (2018-02-18) --------------------------------------- - Lines in 1-D diagram()s can optionally be drawn as splines using the @@ -25,9 +25,12 @@ - In equilibrate(), accept a length > 1 'normalize' argument in order normalize the chemical formulas of only the selected species. -- Export thermo.axis(), as it is useful for adding major&minor tick +- Export thermo.axis(), as it is useful for adding major and minor tick marks after (above) other plot elements such as legends. +- Add C implementation of counting occurrences of all letters in a + string (src/count_letters.c) to speed up operation of count.aa(). + CHANGES IN CHNOSZ 1.1.3 (2017-11-13) ------------------------------------ Modified: pkg/CHNOSZ/man/util.fasta.Rd =================================================================== --- pkg/CHNOSZ/man/util.fasta.Rd 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/man/util.fasta.Rd 2018-02-17 17:47:07 UTC (rev 304) @@ -36,7 +36,6 @@ Use \code{iseq} to select the sequences to read (the default is all sequences). The function returns various formats depending on the value of \code{ret}. The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences). -This function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines. If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to ""). When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided. @@ -44,7 +43,7 @@ \code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}). For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}. -Letters are matched without regard for case. +The matching of letters is case-insensitive. A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations. \code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}). If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}). @@ -52,7 +51,6 @@ \code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}). The \code{protein} argument corresponds to the \samp{Entry name} on the UniProt search pages. - } \value{ Added: pkg/CHNOSZ/src/count_letters.c =================================================================== --- pkg/CHNOSZ/src/count_letters.c (rev 0) +++ pkg/CHNOSZ/src/count_letters.c 2018-02-17 17:47:07 UTC (rev 304) @@ -0,0 +1,20 @@ +#include +#include +#include + +/* [20180217] adapted from https://stackoverflow.com/questions/13213422/count-the-number-of-occurrences-of-each-letter-in-string */ + +void count_letters(char **chararray, int *counts) { + + char *str = chararray[0]; + + int i; + size_t len = strlen(str); + + for (i = 0; i < len; i++) { + char c = str[i]; + if (!isalpha(c)) continue; + counts[(int)(tolower(c) - 'a')]++; + } + +} Modified: pkg/CHNOSZ/src/init.c =================================================================== --- pkg/CHNOSZ/src/init.c 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/src/init.c 2018-02-17 17:47:07 UTC (rev 304) @@ -10,8 +10,21 @@ {NULL, NULL, 0} }; +/* .C calls */ +extern void count_letters(void *, void *); + +static const R_CMethodDef CEntries[] = { + {"count_letters", (DL_FUNC) &count_letters, 2}, + {NULL, NULL, 0} +}; + void R_init_CHNOSZ(DllInfo *dll) { + R_registerRoutines(dll, CEntries, NULL, FortranEntries, NULL); +/* R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL); + R_registerRoutines(dll, CEntries, NULL, NULL, NULL); +*/ R_useDynamicSymbols(dll, FALSE); } + Modified: pkg/CHNOSZ/tests/testthat/test-util.seq.R =================================================================== --- pkg/CHNOSZ/tests/testthat/test-util.seq.R 2018-01-28 08:22:34 UTC (rev 303) +++ pkg/CHNOSZ/tests/testthat/test-util.seq.R 2018-02-17 17:47:07 UTC (rev 304) @@ -17,3 +17,9 @@ # ACG -> UGC (RNA complement) expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4") }) + +test_that("count.aa() correctly processes a longer nucleobase sequence", { + seq <- "ATGTCCCGTTTCTTAGTTGCATTGGTTGCCGCACTTTTAGGAGTTGCAATTGAGATGTCCCTTCTCGTTCGCGCTCAGGGGCAGCAAACCTTGCTTTTGGCTGAAGAAAGCAAGCATTTGTCGCAATTGCGTCAACTGACTTTTGAAGGCACCAATGCCGAAGCGTATTGGTCGCCTGACGGGAAATGGTTGGTCTTTCAATCCACACGCCCACCTTACAAGGCTGACCAAATCTTCATCATGAGAGCGGATGGCTCGGGAGTTCGTGTCGTCAGCACGGGCAAAGGTCGTTGCACTTGTGCCTATTTCACGCCAGATGGCAAAGGCGTTATCTTTGCTACGACCCACCTTGCTGGACCAGAACCGCCGCAAGTGCCCAAACTGGACATTCCACGCTATGTTTGGGGCGTGTTCCCAAGTTACGAACTTTACCTGCGGCGTTTGGACACGATGGAACTTATCCGCTTGACCGATAACGAAGGCTACGACGCTGAAGCGACCATTTGCTGGAAGACTGGGCGAATTGTCTTCACAAGTTACCGCAATGGCGACCTTGACCTTTACAGCATGAAATTAGACGGCAGCGATTTGAAGCGATTGACGAAAACCATCGGCTACGAGGGCGGAGCGTTCTACTCGCCCGACGGGAAGCGGATTGTCTTCCGAGCCTATTTGCCAAAGACGCCTGACGAAATTGACGAATACAAGCGGTTGCTCCAGTTAGGCGTCATAAGCCCACCAAAGATGGAGTGGGTCGTCATGGACGCCGACGGTCGCAACATGAAGCAAATC" + counts <- data.frame(A=190, C=203, G=211, T=188) + expect_equal(as.numeric(count.aa(seq, type="DNA")), as.numeric(counts)) +}) From noreply at r-forge.r-project.org Tue Feb 27 02:20:38 2018 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 27 Feb 2018 02:20:38 +0100 (CET) Subject: [CHNOSZ-commits] r305 - in pkg/CHNOSZ: . R inst Message-ID: <20180227012038.655A0189DBB@r-forge.r-project.org> Author: jedick Date: 2018-02-27 02:20:37 +0100 (Tue, 27 Feb 2018) New Revision: 305 Modified: pkg/CHNOSZ/DESCRIPTION pkg/CHNOSZ/R/util.fasta.R pkg/CHNOSZ/inst/NEWS Log: read.fasta(): handle archive::archive_read input connections Modified: pkg/CHNOSZ/DESCRIPTION =================================================================== --- pkg/CHNOSZ/DESCRIPTION 2018-02-17 17:47:07 UTC (rev 304) +++ pkg/CHNOSZ/DESCRIPTION 2018-02-27 01:20:37 UTC (rev 305) @@ -1,6 +1,6 @@ -Date: 2018-02-18 +Date: 2018-02-27 Package: CHNOSZ -Version: 1.1.3-12 +Version: 1.1.3-13 Title: Thermodynamic Calculations for Geobiochemistry Authors at R: c( person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"), Modified: pkg/CHNOSZ/R/util.fasta.R =================================================================== --- pkg/CHNOSZ/R/util.fasta.R 2018-02-17 17:47:07 UTC (rev 304) +++ pkg/CHNOSZ/R/util.fasta.R 2018-02-27 01:20:37 UTC (rev 305) @@ -14,11 +14,23 @@ # fas: fasta entry # value of 'id' is used for 'protein' in output table, # otherwise ID is parsed from FASTA header (can take a while) + + # check if the file is in an archive (https://github.com/jimhester/archive) + if("archive_read" %in% class(file)) { + is.archive <- TRUE + filebase <- gsub("]", "", basename(summary(file)$description)) + } else { + is.archive <- FALSE + filebase <- basename(file) + } if(is.null(lines)) { - message("read.fasta: reading ", basename(file), " ... ", appendLF=FALSE) - #lines <- readLines(file) + message("read.fasta: reading ", filebase, " ... ", appendLF=FALSE) is.nix <- Sys.info()[[1]]=="Linux" - if(is.nix) { + if(is.archive) { + # we can't use scan here? + lines <- readLines(file) + } else if(is.nix) { + # retrieve contents using system command (seems slightly faster even than scan()) # figure out whether to use 'cat', 'zcat' or 'xzcat' suffix <- substr(file,nchar(file)-2,nchar(file)) if(suffix==".gz") mycat <- "zcat" @@ -49,7 +61,7 @@ sequences <- lapply(iseq, seqfun) # organism name is from file name # (basename minus extension) - bnf <- strsplit(basename(file),split=".",fixed=TRUE)[[1]][1] + bnf <- strsplit(filebase,split=".",fixed=TRUE)[[1]][1] organism <- bnf # protein/gene name is from header line for entry # (strip the ">" and go to the first space) @@ -60,7 +72,7 @@ # stop if the first character is not ">" # or the first two charaters are "> " if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0)) - stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'.")) + stop(paste("file",filebase,"line",j,"doesn't begin with FASTA header '>'.")) # discard the leading '>' f2 <- substr(f1, 2, nchar(f1)) # keep everything before the first space Modified: pkg/CHNOSZ/inst/NEWS =================================================================== --- pkg/CHNOSZ/inst/NEWS 2018-02-17 17:47:07 UTC (rev 304) +++ pkg/CHNOSZ/inst/NEWS 2018-02-27 01:20:37 UTC (rev 305) @@ -1,4 +1,4 @@ -CHANGES IN CHNOSZ 1.1.3-12 (2018-02-18) +CHANGES IN CHNOSZ 1.1.3-13 (2018-02-27) --------------------------------------- - Lines in 1-D diagram()s can optionally be drawn as splines using the @@ -31,6 +31,9 @@ - Add C implementation of counting occurrences of all letters in a string (src/count_letters.c) to speed up operation of count.aa(). +- read.fasta(): add support for file connections created using + archive::archive_read (https://github.com/jimhester/archive). + CHANGES IN CHNOSZ 1.1.3 (2017-11-13) ------------------------------------