Author: jedick
Date: 2024-02-28 02:24:53 +0100 (Wed, 28 Feb 2024)
New Revision: 828

Move read.fasta() and count.aa() to canprot

--- pkg/CHNOSZ/DESCRIPTION	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/DESCRIPTION	2024-02-28 01:24:53 UTC (rev 828)
@@ -1,6 +1,6 @@
-Date: 2024-02-11
+Date: 2024-02-28
 Package: CHNOSZ
-Version: 2.1.0
+Version: 2.1.0-1
 Title: Thermodynamic Calculations and Diagrams for Geochemistry
 Authors@R: c(
     person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),
@@ -9,7 +9,7 @@
 Author: Jeffrey Dick [aut, cre] (0000-0002-0687-5890)
 Maintainer: Jeffrey Dick <j3ffdick at gmail.com>
 Depends: R (>= 3.1.0)
-Suggests: tinytest, knitr, rmarkdown, tufte
+Suggests: tinytest, knitr, rmarkdown, tufte, canprot
 Imports: grDevices, graphics, stats, utils
 Description: An integrated set of tools for thermodynamic calculations in
   aqueous geochemistry and geobiochemistry. Functions are provided for writing

--- pkg/CHNOSZ/NAMESPACE	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/NAMESPACE	2024-02-28 01:24:53 UTC (rev 828)
@@ -9,8 +9,8 @@
   "describe.property", "describe.basis", "equilibrate",
   "aminoacids", "ZC.col",
   "pinfo", "protein.length", "protein.formula",
-  "read.fasta", "protein.basis", "add.protein",
-  "unitize", "seq2aa",
+  "protein.basis", "add.protein",
+  "unitize",
   "thermo.refs", "mod.OBIGT",
 # examples
   "examples", "demos", "mtitle",
@@ -20,7 +20,6 @@
   "mass", "entropy", "GHS", "water",
   "dPdTtr", "Ttr",
-  "count.aa",
   "rho.IAPWS95", "IAPWS95", "water.AW90", "WP02.auxiliary", "water.IAPWS95",
   "getrank", "parent", "sciname", "allparents", "getnodes", "getnames",
   "protein.OBIGT", "which.pmax",

Modified: pkg/CHNOSZ/R/add.protein.R
--- pkg/CHNOSZ/R/add.protein.R	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/R/add.protein.R	2024-02-28 01:24:53 UTC (rev 828)
@@ -2,22 +2,6 @@
 # Calculate properties of proteins 20061109 jmd
 # Reorganize protein functions 20120513
-# Count numbers of amino acids in a sequence
-seq2aa <- function(sequence, protein = NA) {
-  # Remove newlines and whitespace
-  sequence <- gsub("\\s", "", gsub("[\r\n]", "", sequence))
-  # Make a data frame from counting the amino acids in the sequence
-  caa <- count.aa(sequence)
-  colnames(caa) <- aminoacids(3)
-  # Now make the data frame
-  po <- strsplit(as.character(protein), "_")[[1]]
-  aa <- data.frame(protein = po[1], organism = po[2], ref = NA, abbrv = NA, stringsAsFactors = FALSE)
-  # chains = 1 for any sequence, chains = 0 for no sequence
-  chains <- sum(nchar(sequence) > 0)
-  aa <- cbind(aa, chains = chains, caa)
-  return(aa)
 # Add amino acid counts to thermo()$protein (returns iprotein)
 add.protein <- function(aa, as.residue = FALSE) {
   # Add a properly constructed data frame of 

Modified: pkg/CHNOSZ/inst/NEWS.Rd
--- pkg/CHNOSZ/inst/NEWS.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/NEWS.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -15,6 +15,15 @@
+\section{Changes in CHNOSZ version 2.1.0-1 (2024-02-28)}{
+    \itemize{
+      \item Move \code{read.fasta()} and \code{count.aa()} to canprot package.
+      \item Remove \code{seq2aa()}.
+    }
 \section{Changes in CHNOSZ version 2.1.0 (2024-02-11)}{
   \subsection{NEW FEATURES}{

Modified: pkg/CHNOSZ/inst/tinytest/test-add.protein.R
--- pkg/CHNOSZ/inst/tinytest/test-add.protein.R	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/tinytest/test-add.protein.R	2024-02-28 01:24:53 UTC (rev 828)
@@ -12,7 +12,7 @@
 expect_equal(ip1, ip2, info = info)
 info <- "Errors and messages occur in some circumstances"
-expect_error(add.protein(count.aa("AAA")), "does not have the same columns as thermo\\(\\)\\$protein", info = info)
+expect_error(add.protein(canprot::count.aa("AAA")), "does not have the same columns as thermo\\(\\)\\$protein", info = info)
 expect_message(add.protein(pinfo(pinfo("CYC_BOVIN"))), "replaced 1 existing protein\\(s\\)", info = info)
 info <- "group additivity for proteins gives expected values"
@@ -33,12 +33,13 @@
 expect_equal(formula, lprop$formula, info = info)
 info <- "read.fasta() identifies sequences correctly and gives amino acid compositions in the correct format"
-ffile <- system.file("extdata/protein/EF-Tu.aln", package = "CHNOSZ")
-aa <- read.fasta(ffile)
-expect_equal(aa[1, ], read.fasta(ffile, 1), info = info)
+ffile <- system.file("extdata/protein/rubisco.fasta", package = "CHNOSZ")
+aa <- canprot::read.fasta(ffile)
+expect_equal(aa[1, ], canprot::read.fasta(ffile, 1), info = info)
 # Use unlist here so that different row names are not compared
-expect_equal(unlist(aa[8, ]), unlist(read.fasta(ffile, 8)), info = info)
-expect_message(ip1 <- add.protein(aa), "added 8 new protein\\(s\\)", info = info)
-expect_message(ip2 <- add.protein(aa), "replaced 8 existing protein\\(s\\)", info = info)
+aa8 <- canprot::read.fasta(ffile, 1:8)
+expect_equal(unlist(aa[1:8, ]), unlist(aa8), info = info)
+expect_message(ip1 <- add.protein(aa8), "added 8 new protein\\(s\\)", info = info)
+expect_message(ip2 <- add.protein(aa8), "replaced 8 existing protein\\(s\\)", info = info)
 # add.protein should return the correct indices for existing proteins
 expect_equal(ip1, ip2, info = info)

Modified: pkg/CHNOSZ/man/CHNOSZ-package.Rd
--- pkg/CHNOSZ/man/CHNOSZ-package.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/CHNOSZ-package.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -24,7 +24,7 @@
   \item Thermodynamic data: \code{\link{data}}, \code{\link{extdata}}, \code{\link{add.OBIGT}}, \code{\link{util.data}}
   \item Thermodynamic calculations: \code{\link{util.formula}}, \code{\link{makeup}}, \code{\link{util.units}}, \code{\link{Berman}}, \code{\link{nonideal}}, \code{\link{util.misc}}
   \item Water properties: \code{\link{water}}, \code{\link{util.water}}, \code{\link{DEW}}, \code{\link{IAPWS95}}
-  \item Protein properties: \code{\link{protein.info}}, \code{\link{add.protein}}, \code{\link{util.fasta}}, \code{\link{util.protein}}, \code{\link{util.seq}}, \code{\link{ionize.aa}}
+  \item Protein properties: \code{\link{protein.info}}, \code{\link{add.protein}}, \code{\link{util.protein}}, \code{\link{util.seq}}, \code{\link{ionize.aa}}
   \item Other tools: \code{\link{examples}}, \code{\link{taxonomy}}
   \item Utility functions: \code{\link{util.expression}}, \code{\link{util.plot}}, \code{\link{util.array}}, \code{\link{util.list}}, \code{\link{palply}}

Modified: pkg/CHNOSZ/man/add.protein.Rd
--- pkg/CHNOSZ/man/add.protein.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/add.protein.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -1,7 +1,6 @@
 \title{Amino Acid Compositions of Proteins}
@@ -10,7 +9,6 @@
   add.protein(aa, as.residue = FALSE)
-  seq2aa(sequence, protein = NA)
   aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
@@ -17,7 +15,6 @@
   \item{aa}{data frame, amino acid composition in the format of \code{thermo()$protein}}
   \item{as.residue}{logical, normalize by protein length?}
-  \item{sequence}{character, protein sequence}
   \item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}()$protein})}
   \item{abundance}{numeric, abundances of proteins}
   \item{average}{logical, return the weighted average of amino acid counts?}
@@ -31,11 +28,7 @@
 The purpose of the functions described here is to identify proteins and work with their amino acid compositions.
 From the amino acid compositions, the thermodynamic properties of the proteins can be estimated by group additivity.
-\code{seq2aa} returns a data frame of amino acid composition for the provided \code{sequence}, in the format of \code{thermo()$protein}.
-In this function, the value of the \code{protein} argument is put into the \code{protein} column of the result.
-If there is an underscore (e.g. \samp{LYSC_CHICK}), it is used to split the text, and the two parts are put into the \code{protein} and \code{organism} columns.
-Given amino acid compositions returned by \code{seq2aa}, \code{add.protein} adds them to \code{thermo()$protein} for use by other functions in CHNOSZ.
+Given a data frame of amino acid compositions in the format of \code{thermo()$protein}, \code{add.protein} adds them to \code{thermo()$protein} for use by other functions in CHNOSZ.
 The amino acid compositions of proteins in \code{aa} with the same name as one in \code{thermo()$protein} are replaced.
 Set \code{as.residue} to TRUE to normalize by protein length; each input amino acid composition is divided by the corresponding number of residues, with the result that the sum of amino acid frequencies for each protein is 1.
@@ -46,7 +39,6 @@
-For \code{seq2aa}, a data frame of amino acid composition and identifying information for proteins.
 For \code{add.protein}, the rownumbers of \code{thermo()$protein} that are added and/or replaced.
 For \code{aasum}, a one-row data frame of amino acid composition and identifying information.
@@ -53,27 +45,22 @@
-# Get the amino acid composition of a protein sequence
-# (Human Gastric juice peptide 1)
-aa <- seq2aa("LAAGKVEDSD", "GAJU_HUMAN")
-# Add the protein to CHNOSZ
-ip <- add.protein(aa)
-# Calculate the protein length and chemical formula
-protein.length(ip) # 10
-as.chemical.formula(protein.formula(ip)) # "C41H69N11O18"
+# Read a file with the amino acid compositions of several poliovirus protein subunits
+file <- system.file("extdata/protein/POLG.csv", package = "CHNOSZ")
+aa <- read.csv(file)
-# Calculate a formula without using add.protein
-aa <- seq2aa("ANLSG", "pentapeptide_test")
+# Add the proteins to CHNOSZ
+iprotein <- add.protein(aa)
+# Calculate length and elemental formula
-# Sum the amino acid compositions of several poliovirus protein subunits
-file <- system.file("extdata/protein/POLG.csv", package = "CHNOSZ")
-aa <- read.csv(file, as.is = TRUE)
+# Sum the amino acid compositions and assign a new protein name
 aasum(aa, protein = "POLG_sum")
-\code{\link{read.fasta}} for another way of getting amino acid compositions that can be used with \code{add.protein}.
+\code{\link[canprot]{read.fasta}} for reading amino acid compositions from FASTA files.
 \code{\link{pinfo}} for protein-level functions (length, chemical formulas, reaction coefficients of basis species).

Modified: pkg/CHNOSZ/man/extdata.Rd
--- pkg/CHNOSZ/man/extdata.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/extdata.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -39,7 +39,6 @@
   Files in \code{protein} contain protein sequences and amino acid compositions for proteins.
-    \item \code{EF-Tu.aln} consists of aligned sequences (394 amino acids) of elongation factor Tu (EF-Tu). The sequences correspond to those taken from UniProtKB for ECOLI (\emph{Escherichia coli}), THETH (\emph{Thermus thermophilus}) and THEMA (\emph{Thermotoga maritima}), and reconstructed ancestral sequences taken from Gaucher et al., 2003 (maximum likelihood bacterial stem and mesophilic bacterial stem, and alternative bacterial stem). See \code{\link{read.fasta}} for an example that uses this file.
     \item \code{rubisco.fasta} Sequences of Rubisco obtained from UniProt (see Dick, 2014). See the vignette \viglink{anintro} for an example that uses this file.
     \item \code{POLG.csv}
       Amino acid compositions of a few proteins used for some tests and examples.
@@ -81,8 +80,6 @@
 Gattiker, A., Michoud, K., Rivoire, C., Auchincloss, A. H., Coudert, E., Lima, T., Kersey, P., Pagni, M., Sigrist, C. J. A., Lachaize, C., Veuthey, A.-L., Gasteiger, E. and Bairoch, A. (2003) Automatic annotation of microbial proteomes in Swiss-Prot. \emph{Comput. Biol. Chem.} \bold{27}, 49--58. \doi{10.1016/S1476-9271(02)00094-4}
-Gaucher, E. A., Thomson, J. M., Burgan, M. F. and Benner, S. A (2003) Inferring the palaeoenvironment of ancient bacteria on the basis of resurrected proteins. \emph{Nature} \bold{425}(6955), 285--288. \doi{10.1038/nature01977}
 Helgeson, H. C., Delany, J. M., Nesbitt, H. W. and Bird, D. K. (1978) Summary and critique of the thermodynamic properties of rock-forming minerals. \emph{Am. J. Sci.} \bold{278-A}, 1--229. \url{https://www.worldcat.org/oclc/13594862}
 Hnědkovský, L., Wood, R. H. and Majer, V. (1996) Volumes of aqueous solutions of \CH4, \CO2, \H2S, and \NH3 at temperatures from 298.15 K to 705 K and pressures to 35 MPa. \emph{J. Chem. Thermodyn.} \bold{28}, 125--142. \doi{10.1006/jcht.1996.0011}

Modified: pkg/CHNOSZ/man/palply.Rd
--- pkg/CHNOSZ/man/palply.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/palply.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -25,7 +25,7 @@
-\code{\link{read.fasta}}, \code{\link{count.aa}}, \code{\link{affinity}}, \code{\link{equil.boltzmann}} and \code{\link{equil.reaction}} for functions that use \code{palply}.
+\code{\link{affinity}}, \code{\link{equil.boltzmann}} and \code{\link{equil.reaction}} for functions that use \code{palply}.
 Tests are in \file{tests/test-util.program.R}, and a \dQuote{real world} example is in \file{demo/density.R}.

Modified: pkg/CHNOSZ/man/rank.affinity.Rd
--- pkg/CHNOSZ/man/rank.affinity.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/rank.affinity.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -41,7 +41,7 @@
 datfile <- system.file("extdata/cpetc/rubisco.csv", package = "CHNOSZ")
 fastafile <- system.file("extdata/protein/rubisco.fasta", package = "CHNOSZ")
 dat <- read.csv(datfile)
-aa <- read.fasta(fastafile)
+aa <- canprot::read.fasta(fastafile)
 groups <- sapply(c("A", "B", "E"), "==", dat$domain, simplify = FALSE)
 names(groups) <- c("Archaea", "Bacteria", "Eukaryota")
 ip <- add.protein(aa, as.residue = TRUE)

Modified: pkg/CHNOSZ/man/util.seq.Rd
--- pkg/CHNOSZ/man/util.seq.Rd	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/util.seq.Rd	2024-02-28 01:24:53 UTC (rev 828)
@@ -25,11 +25,13 @@
-\seealso{\code{\link{count.aa}} for counting amino acids or nucleic-acid bases in a sequence; \code{\link{protein.formula}} for calculating the chemical formulas of proteins.}
+  \code{\link{protein.formula}} for calculating the chemical formulas of proteins.
-## Count nucleobases in a sequence
-bases <- count.aa("ACCGGGTTT", type = "DNA")
+# Three-letter abbreviations of amino acids
 \concept{Protein properties}

Modified: pkg/CHNOSZ/src/init.c
--- pkg/CHNOSZ/src/init.c	2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/src/init.c	2024-02-28 01:24:53 UTC (rev 828)
@@ -13,19 +13,11 @@
     {NULL, NULL, 0}
-/* .C calls */
-extern void count_letters(void *, void *);
-static const R_CMethodDef CEntries[] = {
-    {"count_letters", (DL_FUNC) &count_letters, 2},
-    {NULL, NULL, 0}
 void R_init_CHNOSZ(DllInfo *dll)
+    R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
     R_registerRoutines(dll, CEntries, NULL, FortranEntries, NULL);
-    R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
     R_registerRoutines(dll, CEntries, NULL, NULL, NULL);

