[CHNOSZ-commits] r167 - in pkg/CHNOSZ: . R demo inst man tests/testthat vignettes

Mon Feb 20 13:27:25 CET 2017

Author: jedick
Date: 2017-02-20 13:27:24 +0100 (Mon, 20 Feb 2017)
New Revision: 167

Added:
   pkg/CHNOSZ/R/util.protein.R
   pkg/CHNOSZ/man/util.protein.Rd
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/add.protein.R
   pkg/CHNOSZ/R/info.R
   pkg/CHNOSZ/R/protein.info.R
   pkg/CHNOSZ/R/util.affinity.R
   pkg/CHNOSZ/demo/ionize.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/man/add.protein.Rd
   pkg/CHNOSZ/man/info.Rd
   pkg/CHNOSZ/man/ionize.aa.Rd
   pkg/CHNOSZ/man/protein.info.Rd
   pkg/CHNOSZ/tests/testthat/test-add.protein.R
   pkg/CHNOSZ/tests/testthat/test-ionize.aa.R
   pkg/CHNOSZ/tests/testthat/test-util.affinity.R
   pkg/CHNOSZ/vignettes/anintro.Rmd
Log:
merge ip2aa() with protein.info()


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/DESCRIPTION	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,6 +1,6 @@
 Date: 2017-02-20
 Package: CHNOSZ
-Version: 1.0.8-56
+Version: 1.0.8-57
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey Dick
 Maintainer: Jeffrey Dick <j3ffdick at gmail.com>

Modified: pkg/CHNOSZ/R/add.protein.R
===================================================================
--- pkg/CHNOSZ/R/add.protein.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/R/add.protein.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -3,25 +3,11 @@
 # reorganize protein functions 20120513
 
 # add.protein - add amino acid counts to thermo$protein (returns iprotein)
-# ip2aa - select amino acid counts (data frame) from thermo$protein
 # aa2eos - perform group additivity calculations
 # seq2aa - calculate amino acid counts from a sequence
 # aasum - combine amino acid counts (sum, average, or weighted sum by abundance)
 # read.aa - read amino acid counts from a file
 
-ip2aa <- function(protein, organism=NULL, residue=FALSE) {
-  # return amino acid counts (rows from thermo$protein)
-  # or 'protein' if it is a data frame
-  if(is.data.frame(protein)) return(protein)
-  iprotein <- protein.info(protein, organism)
-  # drop NA matches
-  iprotein <- iprotein[!is.na(iprotein)]
-  out <- get("thermo")$protein[iprotein, ]
-  # compute per-residue counts
-  if(residue) out[, 5:25] <- out[, 5:25]/rowSums(out[, 6:25])
-  return(out)
-}
-
 aa2eos <- function(aa, state=get("thermo")$opt$state) {
   # display and return the properties of
   # proteins calculated from amino acid composition

Modified: pkg/CHNOSZ/R/info.R
===================================================================
--- pkg/CHNOSZ/R/info.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/R/info.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -45,7 +45,7 @@
         # here we use a default state from thermo$opt$state
         if(is.null(state)) state <- thermo$opt$state
         # retrieve the amino acid composition
-        aa <- ip2aa(ip)
+        aa <- protein.info(ip)
         # add up protein properties
         eos <- aa2eos(aa, state)
         # the real assignment work 

Modified: pkg/CHNOSZ/R/protein.info.R
===================================================================
--- pkg/CHNOSZ/R/protein.info.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/R/protein.info.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,99 +1,52 @@
-# CHNOSZ/util.protein.R
+# CHNOSZ/protein.info.R
+
 # calculate formulas and summarize properties of proteins
-# MP90.cp: additive heat capacity from groups of Makhatadze and Privalov, 1990
-# group.formulas: chemical makeup of the amino acid residues
 # protein.info: find rownumber in thermo$protein
 # protein.formula: chemical makeup of the indicated proteins
 # protein.length: lengths of the indicated proteins
 # protein.basis: coefficients of basis species in formation reactions of [ionized] proteins [residues]
 # protein.equil: step-by-step example of protein equilibrium calculation
 
-protein.info <- function(protein, organism=NULL) {
-  # find the rownumber(s) of thermo$protein that matches
-  # 'protein' numeric (the rownumber itself)
-  # 'protein' character, e.g. LYSC_CHICK
-  # 'protein' and 'organism', e.g. 'LYSC', 'CHICK'
+protein.info <- function(protein, organism=NULL, residue=FALSE) {
+  # return the `protein` (possibly per residue) for:
+  #   dataframe `protein`
+  # return the rownumber(s) of thermo$protein for:
+  #   character `protein`, e.g. LYSC_CHICK
+  #   character `protein` and `organism`, e.g. 'LYSC', 'CHICK'
+  # return the row(s) of thermo$protein (possibly per residue) for:
+  #   numeric `protein` (the rownumber itself)
   thermo <- get("thermo")
+  if(is.data.frame(protein)) out <- protein
   if(is.numeric(protein)) {
+    # drop NA matches to thermo$protein
     iproteins <- 1:nrow(thermo$protein)
     protein[!protein %in% iproteins] <- NA
-    iprotein <- protein
+    # get amino acid counts
+    out <- thermo$protein[protein, ]
+  }
+  if(is.data.frame(protein) | is.numeric(protein)) {
+    # compute per-residue counts if requested
+    if(residue) out[, 5:25] <- out[, 5:25]/rowSums(out[, 6:25])
   } else {
     # from here we'll search by protein/organism pairs
     tp.po <- paste(thermo$protein$protein, thermo$protein$organism, sep="_")
     if(is.null(organism)) my.po <- protein
     else my.po <- paste(protein, organism, sep="_")
     iprotein <- match(my.po, tp.po)
+    # tell the user about NA's
+    if(any(is.na(iprotein))) {
+      nNA <- sum(is.na(iprotein))
+      if(nNA==1) ptext <- "" else ptext <- "s"
+      message("iprotein: ", sum(is.na(iprotein)), " protein", ptext, " not matched")
+    }
+    out <- iprotein
   }
-  # tell the user about NA's
-  if(any(is.na(iprotein))) {
-    nNA <- sum(is.na(iprotein))
-    if(nNA==1) ptext <- "" else ptext <- "s"
-    message("iprotein: ", sum(is.na(iprotein)), " protein", ptext, " not matched")
-  }
-  return(iprotein)
+  out
 }
 
-MP90.cp <- function(protein, T) {
-  # T (temperature, degrees C), protein (name of protein)
-  # returns heat capacity of protein (kj/mol)
-  # using algorithm of makhatadze and privalov, 1990.
-  TMP <- c(5,25,50,75,100,125)
-  A.cp <- splinefun(TMP,c(175.7,166.7,156.2,144.7,134.6,124.1))
-  C.cp <- splinefun(TMP,c(225.4,237.6,250.8,260.7,268.2,276.1))
-  D.cp <- splinefun(TMP,c( 72.8, 89.0,106.2,124.5,140.7,154.3))
-  E.cp <- splinefun(TMP,c(168.3,179.0,192.0,203.7,211.4,217.8))
-  F.cp <- splinefun(TMP,c(395.7,383.0,370.3,358.4,348.3,339.6))
-  G.cp <- splinefun(TMP,c( 82.3, 78.0, 71.7, 66.4, 59.7, 53.9))
-  H.cp <- splinefun(TMP,c(205.7,179.6,177.2,179.6,187.1,196.8))
-  I.cp <- splinefun(TMP,c(406.8,402.3,397.1,390.8,386.0,380.8))
-  K.cp <- splinefun(TMP,c(328.8,332.5,334.0,337.5,339.4,343.6))
-  L.cp <- splinefun(TMP,c(385.9,381.7,377.8,372.9,369.4,365.5))
-  M.cp <- splinefun(TMP,c(197.1,175.9,158.1,150.3,148.1,143.9))
-  N.cp <- splinefun(TMP,c( 72.9, 88.8,109.8,125.2,140.5,154.2))
-  P.cp <- splinefun(TMP,c(214.6,177.7,152.3,142.8,135.6,130.1))
-  Q.cp <- splinefun(TMP,c(168.0,180.2,193.4,203.3,210.8,218.7))
-  R.cp <- splinefun(TMP,c(204.6,273.4,305.8,315.1,318.7,318.5))
-  S.cp <- splinefun(TMP,c( 75.6, 81.2, 85.7, 91.4, 97.3,102.1))
-  T.cp <- splinefun(TMP,c(194.2,184.5,182.2,186.5,199.0,216.2))
-  V.cp <- splinefun(TMP,c(324.6,314.4,305.0,294.7,285.7,269.6))
-  W.cp <- splinefun(TMP,c(471.2,458.5,445.8,433.9,423.8,415.1))
-  Y.cp <- splinefun(TMP,c(310.6,301.7,295.2,294.5,300.1,304.0))
-  AA.cp <- splinefun(TMP,c(-158.3,-90.4,-21.5,-32.3,-92.4,-150.0))
-  UPBB.cp <- splinefun(TMP,c(3.7,15.2,26.2,29.8,33.7,33.7))
-  cnew <- numeric()
-  for(i in 1:length(T)) {
-    Ti <- T[i]
-    cp <- c(A.cp(Ti),C.cp(Ti),D.cp(Ti),E.cp(Ti),F.cp(Ti),
-            G.cp(Ti),H.cp(Ti),I.cp(Ti),K.cp(Ti),L.cp(Ti),
-            M.cp(Ti),N.cp(Ti),P.cp(Ti),Q.cp(Ti),R.cp(Ti),
-            S.cp(Ti),T.cp(Ti),V.cp(Ti),W.cp(Ti),Y.cp(Ti))
-    # get the protein composition
-    tt <- ip2aa(protein)[,6:25]
-    cnew <- c(cnew, sum(cp * as.numeric(tt)) + sum(as.numeric(tt)) * UPBB.cp(Ti))
-  }
-  return(cnew)
-}
-
-group.formulas <- function() {
-  # return a matrix with chemical formulas of residues
-  # names of the sidechain groups
-  groups <- paste("[", aminoacids(3), "]", sep="")
-  # the indices of H2O, sidechain groups, and [UPBB]
-  ig <- suppressMessages(info(c("H2O", groups, "[UPBB]")))
-  # their formulas
-  A <- i2A(ig)
-  # add [UPBB] to the sidechain groups to get residues
-  out <- A[1:21,]
-  out[2:21,] <- t(t(A) + A[22,])[2:21,]
-  # make "H2O" not "water"
-  rownames(out)[1] <- "H2O"
-  return(out)
-}
-
 protein.formula <- function(protein, organism=NULL, residue=FALSE) {
   # return a matrix with chemical formulas of proteins
-  aa <- ip2aa(protein, organism)
+  aa <- protein.info(protein.info(protein, organism))
   rf <- group.formulas()
   out <- as.matrix(aa[, 5:25]) %*% as.matrix(rf)
   if(residue) out <- out / rowSums(aa[, 6:25])
@@ -103,7 +56,7 @@
 
 protein.length <- function(protein, organism=NULL) {
   # calculate the length(s) of proteins
-  aa <- ip2aa(protein, organism)
+  aa <- protein.info(protein.info(protein, organism))
   # use rowSums on the columns containing amino acid counts
   pl <- as.numeric(rowSums(aa[, 6:25]))
   return(pl)
@@ -114,7 +67,7 @@
   # to form proteins (possibly per normalized by length) listed in protein
   # 20120528 renamed protein.basis from residue.info ...
   # what are the elemental compositions of the proteins
-  aa <- ip2aa(protein)
+  aa <- protein.info(protein.info(protein))
   pf <- protein.formula(aa)
   # what are the coefficients of the basis species in the formation reactions
   sb <- species.basis(pf)
@@ -141,7 +94,7 @@
   message("protein.equil: temperature from argument is ", T, " degrees C")
   TK <- convert(T, "K")
   # get the amino acid compositions of the proteins
-  aa <- ip2aa(protein)
+  aa <- protein.info(protein.info(protein))
   # get some general information about the proteins
   pname <- paste(aa$protein, aa$organism, sep="_")
   plength <- protein.length(aa)

Modified: pkg/CHNOSZ/R/util.affinity.R
===================================================================
--- pkg/CHNOSZ/R/util.affinity.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/R/util.affinity.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -420,7 +420,7 @@
   # initialize output list
   out <- vector("list", length(iprotein))
   # get aa from iprotein
-  aa <- ip2aa(iprotein)
+  aa <- protein.info(iprotein)
   # calculate the values of A/2.303RT as a function of T-P-pH
   A <- ionize.aa(aa=aa, property="A", T=TPpH$T, P=TPpH$P, pH=TPpH$pH)
   if(transect) {

Added: pkg/CHNOSZ/R/util.protein.R
===================================================================
--- pkg/CHNOSZ/R/util.protein.R	                        (rev 0)
+++ pkg/CHNOSZ/R/util.protein.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -0,0 +1,60 @@
+# CHNOSZ/util.protein.R
+# MP90.cp: additive heat capacity from groups of Makhatadze and Privalov, 1990
+# group.formulas: chemical makeup of the amino acid residues
+
+MP90.cp <- function(protein, T) {
+  # T (temperature, degrees C), protein (name of protein)
+  # returns heat capacity of protein (kj/mol)
+  # using algorithm of makhatadze and privalov, 1990.
+  TMP <- c(5,25,50,75,100,125)
+  A.cp <- splinefun(TMP,c(175.7,166.7,156.2,144.7,134.6,124.1))
+  C.cp <- splinefun(TMP,c(225.4,237.6,250.8,260.7,268.2,276.1))
+  D.cp <- splinefun(TMP,c( 72.8, 89.0,106.2,124.5,140.7,154.3))
+  E.cp <- splinefun(TMP,c(168.3,179.0,192.0,203.7,211.4,217.8))
+  F.cp <- splinefun(TMP,c(395.7,383.0,370.3,358.4,348.3,339.6))
+  G.cp <- splinefun(TMP,c( 82.3, 78.0, 71.7, 66.4, 59.7, 53.9))
+  H.cp <- splinefun(TMP,c(205.7,179.6,177.2,179.6,187.1,196.8))
+  I.cp <- splinefun(TMP,c(406.8,402.3,397.1,390.8,386.0,380.8))
+  K.cp <- splinefun(TMP,c(328.8,332.5,334.0,337.5,339.4,343.6))
+  L.cp <- splinefun(TMP,c(385.9,381.7,377.8,372.9,369.4,365.5))
+  M.cp <- splinefun(TMP,c(197.1,175.9,158.1,150.3,148.1,143.9))
+  N.cp <- splinefun(TMP,c( 72.9, 88.8,109.8,125.2,140.5,154.2))
+  P.cp <- splinefun(TMP,c(214.6,177.7,152.3,142.8,135.6,130.1))
+  Q.cp <- splinefun(TMP,c(168.0,180.2,193.4,203.3,210.8,218.7))
+  R.cp <- splinefun(TMP,c(204.6,273.4,305.8,315.1,318.7,318.5))
+  S.cp <- splinefun(TMP,c( 75.6, 81.2, 85.7, 91.4, 97.3,102.1))
+  T.cp <- splinefun(TMP,c(194.2,184.5,182.2,186.5,199.0,216.2))
+  V.cp <- splinefun(TMP,c(324.6,314.4,305.0,294.7,285.7,269.6))
+  W.cp <- splinefun(TMP,c(471.2,458.5,445.8,433.9,423.8,415.1))
+  Y.cp <- splinefun(TMP,c(310.6,301.7,295.2,294.5,300.1,304.0))
+  AA.cp <- splinefun(TMP,c(-158.3,-90.4,-21.5,-32.3,-92.4,-150.0))
+  UPBB.cp <- splinefun(TMP,c(3.7,15.2,26.2,29.8,33.7,33.7))
+  cnew <- numeric()
+  for(i in 1:length(T)) {
+    Ti <- T[i]
+    cp <- c(A.cp(Ti),C.cp(Ti),D.cp(Ti),E.cp(Ti),F.cp(Ti),
+            G.cp(Ti),H.cp(Ti),I.cp(Ti),K.cp(Ti),L.cp(Ti),
+            M.cp(Ti),N.cp(Ti),P.cp(Ti),Q.cp(Ti),R.cp(Ti),
+            S.cp(Ti),T.cp(Ti),V.cp(Ti),W.cp(Ti),Y.cp(Ti))
+    # get the protein composition
+    tt <- protein.info(protein.info(protein))[,6:25]
+    cnew <- c(cnew, sum(cp * as.numeric(tt)) + sum(as.numeric(tt)) * UPBB.cp(Ti))
+  }
+  return(cnew)
+}
+
+group.formulas <- function() {
+  # return a matrix with chemical formulas of residues
+  # names of the sidechain groups
+  groups <- paste("[", aminoacids(3), "]", sep="")
+  # the indices of H2O, sidechain groups, and [UPBB]
+  ig <- suppressMessages(info(c("H2O", groups, "[UPBB]")))
+  # their formulas
+  A <- i2A(ig)
+  # add [UPBB] to the sidechain groups to get residues
+  out <- A[1:21,]
+  out[2:21,] <- t(t(A) + A[22,])[2:21,]
+  # make "H2O" not "water"
+  rownames(out)[1] <- "H2O"
+  return(out)
+}

Modified: pkg/CHNOSZ/demo/ionize.R
===================================================================
--- pkg/CHNOSZ/demo/ionize.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/demo/ionize.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,5 +1,5 @@
 ## ionize.aa(): Contour plots of net charge and ionization properties of LYSC_CHICK
-aa <- ip2aa("LYSC_CHICK")
+aa <- protein.info("LYSC_CHICK")
 pH <- seq(0, 14, 0.2)
 T <- seq(0, 200, 2)
 val <- expand.grid(pH=pH, T=T)

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/inst/NEWS	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.8-56 (2017-02-20)
+CHANGES IN CHNOSZ 1.0.8-57 (2017-02-20)
 ---------------------------------------
 
 DOCUMENTATION:
@@ -121,6 +121,8 @@
 - Rename iprotein() to protein.info(), replacing the previous function
   of the same name.
 
+- Merge ip2aa() with protein.info().
+
 CHANGES IN CHNOSZ 1.0.8 (2016-05-28)
 ------------------------------------
 

Modified: pkg/CHNOSZ/man/add.protein.Rd
===================================================================
--- pkg/CHNOSZ/man/add.protein.Rd	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/man/add.protein.Rd	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,5 +1,4 @@
 \name{add.protein}
-\alias{ip2aa}
 \alias{aa2eos}
 \alias{seq2aa}
 \alias{aasum}
@@ -11,7 +10,6 @@
 }
 
 \usage{
-  ip2aa(protein, organism=NULL, residue=FALSE)
   aa2eos(aa, state=get("thermo")$opt$state)
   seq2aa(protein, sequence)
   aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
@@ -22,7 +20,6 @@
 \arguments{
   \item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}$protein})}
   \item{organism}{character, name of organism}
-  \item{residue}{logical, compute per-residue counts?}
   \item{aa}{data frame, amino acid composition in the format of \code{thermo$protein}}
   \item{state}{character, physical state}
   \item{sequence}{character, protein sequence}
@@ -37,10 +34,8 @@
 
   Often, the names of proteins are sufficient to set up calculations using functions such as \code{\link{subcrt}} or \code{\link{species}}. The names of proteins in CHNOSZ are distinguished from those of other chemical species by having an underscore character ("_") that separates two identifiers, referred to as the \code{protein} and \code{organism} (but any other meaning can be attached to these names). An example is \samp{LYSC_CHICK}. 
 
-  The first few functions provide low-level operations:
+  The first function provides low-level operations:
 
-  \code{ip2aa} returns the row(s) of \code{thermo$protein} that match the supplied protein names, OR the protein indices found by \code{iprotin}. Set \code{residue} to TRUE to return the per-residue composition (i.e. amino acid composition of the protein divided by total number of residues). For this function only, if the \code{protein} argument is a data frame, it is returned unchanged, except for possibly the per-residue calculation.
-
   \code{aa2eos} calculates the thermodynamic properties and equations-of-state parameters for the completely nonionized proteins using group additivity with parameters taken from Dick et al., 2006 (aqueous proteins) and LaRowe and Dick, 2012 (crystalline proteins and revised aqueous methionine sidechain group). The return value is a data frame in the same format as \code{thermo$obigt}. \code{state} indicates the physical state for the parameters used in the calculation (\samp{aq} or \samp{cr}).
 
   The remaining functions are more likely to be called directly by the user:

Modified: pkg/CHNOSZ/man/info.Rd
===================================================================
--- pkg/CHNOSZ/man/info.Rd	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/man/info.Rd	2017-02-20 12:27:24 UTC (rev 167)
@@ -28,7 +28,7 @@
 \details{
   \code{info} is the primary function used for querying the thermodynamic database (\code{\link{thermo}$obigt}). In common usage, it is called recursively; first with a character value (or values) for \code{species} indicating the name(s) or formula(s) of the species of interest. The result of this call is a numeric value, which can be provided as an argument in a second call to \code{info} in order to retrieve a data frame of the thermodynamic properties of the species. For its work, \code{info} calls on the other functions that are described below, which unlike \code{info} all expect arguments with length=1.
 
-  \code{info.character} searches for matches of the indicated \code{species} to names, chemical formulas, and abbreviations (in the \samp{abbrv} column) in the thermodynamic database. If the text of the \code{species} is matched the index of that species is returned. If there are multiple matches for the \code{species} and \code{state} is NULL, the index of first match is returned. The order of entries in \code{thermo$obigt} is grouped by states in the order \samp{aq}, \samp{cr}, \samp{gas}, \samp{liq}, so for species in both aqueous and gaseous states the index of the aqueous species is returned, unless \code{state} is set to \samp{gas}. The two exceptions are species identified by \samp{O2} or \samp{oxygen} (which without any indicated \code{state} matches the gaseous species) and \samp{H2O} (which matches the liquid species even if the indicated state is \samp{aq}). Normally, if a species match can not be located, the function then looks for proteins with the name of \code{species} (using \code{\link{protein.info}}), computes its properties if found (\code{\link{ip2aa}}) and adds this to the thermodynamic database (\code{\link{mod.obigt}}). \code{check.protein} prevents the processing of proteins and is provided to avoid an infinite loop in the interaction with \code{mod.obigt}.
+  \code{info.character} searches for matches of the indicated \code{species} to names, chemical formulas, and abbreviations (in the \samp{abbrv} column) in the thermodynamic database. If the text of the \code{species} is matched the index of that species is returned. If there are multiple matches for the \code{species} and \code{state} is NULL, the index of first match is returned. The order of entries in \code{thermo$obigt} is grouped by states in the order \samp{aq}, \samp{cr}, \samp{gas}, \samp{liq}, so for species in both aqueous and gaseous states the index of the aqueous species is returned, unless \code{state} is set to \samp{gas}. The two exceptions are species identified by \samp{O2} or \samp{oxygen} (which without any indicated \code{state} matches the gaseous species) and \samp{H2O} (which matches the liquid species even if the indicated state is \samp{aq}). Normally, if a species match can not be located, the function then looks for proteins with the name of \code{species} (using \code{\link{protein.info}}), computes its properties if found (\code{\link{aa2eos}}) and adds this to the thermodynamic database (\code{\link{mod.obigt}}). \code{check.protein} prevents the processing of proteins and is provided to avoid an infinite loop in the interaction with \code{mod.obigt}.
 
   \code{info.character} has additional logic for dealing with proteins and with multiple matches for the \samp{cr} state. If the \code{state} is \samp{cr}, matches will be counted for states entered as \samp{cr1}, \samp{cr2} etc in the database, and all of the species indices will be returned. Note, however, that \code{info} only ever returns a single species index, which becomes NA in the case of multiple matches to \samp{cr}. This functionality of \code{info.character} is used in \code{\link{subcrt}} to handle minerals with phase transitions. 
 

Modified: pkg/CHNOSZ/man/ionize.aa.Rd
===================================================================
--- pkg/CHNOSZ/man/ionize.aa.Rd	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/man/ionize.aa.Rd	2017-02-20 12:27:24 UTC (rev 167)
@@ -32,7 +32,7 @@
 }
 
 \seealso{
-  The amino acid composition in \code{aa} can be generated using e.g. \code{\link{ip2aa}}. 
+  The amino acid composition in \code{aa} can be generated using \code{\link{protein.info}}.
 
   This function is called by \code{\link{A.ionization}} as part of a calculation of \code{\link{affinity}} if proteins are among the species of interest, \samp{H+} is in the basis, and \code{thermo$opt$ionize} is TRUE. 
 }
@@ -43,7 +43,7 @@
 # the rownumber of the protein in thermo$protein
 ip <- protein.info("LYSC_CHICK")
 # its amino acid composition
-aa <- ip2aa(ip)
+aa <- protein.info(ip)
 # additive charges of unfolded protein at 25, 100, 150 degrees C
 # as a function of pH
 pH <- seq(0, 14, 0.1)
@@ -72,7 +72,7 @@
 plot(T, Cp.nonion, xlab=axis.label("T"), type="l",
   ylab=axis.label("Cp"), ylim=c(5000, 8000))
 # Cp of ionization and ionized protein
-aa <- ip2aa("LYSC_CHICK")
+aa <- protein.info(protein.info("LYSC_CHICK"))
 for(pH in c(5, 9, 3)) {
   Cp.ionized <- Cp.nonion + ionize.aa(aa, "Cp", T=T, pH=pH)[, 1]
   lines(T, Cp.ionized, lty=2)

Modified: pkg/CHNOSZ/man/protein.info.Rd
===================================================================
--- pkg/CHNOSZ/man/protein.info.Rd	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/man/protein.info.Rd	2017-02-20 12:27:24 UTC (rev 167)
@@ -1,12 +1,10 @@
 \encoding{UTF-8}
 \name{protein.info}
-\alias{group.formulas}
 \alias{protein.formula}
 \alias{protein.length}
 \alias{protein.info}
 \alias{protein.basis}
 \alias{protein.equil}
-\alias{MP90.cp}
 \title{Summaries of Thermodynamic Properties of Proteins}
 
 \description{
@@ -14,13 +12,11 @@
 }
 
 \usage{
-  protein.info(protein, organism=NULL)
+  protein.info(protein, organism=NULL, residue=FALSE)
   protein.formula(protein, organism = NULL, residue = FALSE)
   protein.length(protein, organism = NULL)
   protein.basis(protein, T = 25, normalize = FALSE)
   protein.equil(protein, T=25, loga.protein = 0, digits = 4)
-  MP90.cp(protein, T)
-  group.formulas()
 }
 
 \arguments{
@@ -34,31 +30,32 @@
 }
 
 \details{
-  These functions accept \code{protein} (and optionally \code{organism}) in the same way as \code{\link{ip2aa}}, that is, as a protein name (optionally with the \code{organism} part separated), one or more row numbers in \code{thermo$protein} that can be identified using \code{\link{protein.info}}, or a data frame in the format of \code{thermo$protein}.
+For character \code{protein}, \code{protein.info} returns the rownumber(s) of \code{thermo$protein} that match the protein names.
+The names can be supplied in the single \code{protein} argument (with an underscore) or as individual \code{protein}s and \code{organism}s.
+Any protein not matched returns an NA and generates a message.
 
-  \code{protein.info} returns the rownumber(s) of \code{thermo$protein} that match the protein names. The names can be supplied in the single \code{protein} argument or as separated \code{protein}s and \code{organism}s (without the underscore). Any protein not matched returns an NA and generates a message.
+For numeric \code{protein}, \code{protein.info} returns the corresponding row(s) of \code{thermo$protein}.
+Set \code{residue} to TRUE to return the per-residue composition (i.e. amino acid composition of the protein divided by total number of residues).
 
-  \code{protein.formula} returns a stoichiometrix matrix representing the chemical formulas of the proteins that can be pased to e.g. \code{\link{mass}} or \code{\link{ZC}}. The amino acid compositions are multiplied by the output of \code{group.formulas} to generate the result. \code{group.formulas} returns the chemical formulas of each of the 20 common amino acid residues in proteins, as well as the terminal -H and -H (treated as the [H2O] group).
+For dataframe \code{protein}, \code{protein.info} returns it unchanged, except for possibly the per-residue calculation.
 
-  \code{protein.length} returns the lengths (number of amino acids) of the proteins.
+The following functions accept any specification of protein(s) described above for \code{protein.info}:
 
-  The following two functions depend on an existing definition of the basis species:
+\code{protein.formula} returns a stoichiometrix matrix representing the chemical formulas of the proteins that can be pased to e.g. \code{\link{mass}} or \code{\link{ZC}}.
+The amino acid compositions are multiplied by the output of \code{\link{group.formulas}} to generate the result. 
 
+\code{protein.length} returns the lengths (number of amino acids) of the proteins.
+
+The following functions also depend on an existing definition of the basis species:
+
 \code{protein.basis} calculates the numbers of the basis species (i.e. opposite of the coefficients in the formation reactions) that can be combined to form the composition of each of the proteins.
 The basis species must be present in \code{thermo$basis}, and if \samp{H+} is among the basis species, the ionization states of the proteins are included.
 The ionization state of the protein is calculated at the pH defined in \code{thermo$basis} and at the temperature specified by the \code{T} argument.
 If \code{normalize} is TRUE, the coefficients on the basis species are divided by the lengths of the proteins. 
 
   \code{protein.equil} produces a series of messages showing step-by-step a calculation of the chemical activities of proteins in metastable equilibrium. For the first protein, it shows the standard Gibbs energies of the reaction to form the nonionized protein from the basis species and of the ionization reaction of the protein (if \samp{H+} is in the basis), then the standard Gibbs energy/RT of the reaction to form the (possibly ionized) protein per residue. The per-residue values of \samp{logQstar} and \samp{Astar/RT} are also shown for the first protein. Equilibrium calculations are then performed, only if more than one protein is specified. This calculation applies the Boltzmann distribution to the calculation of the equilibrium degrees of formation of the residue equivalents of the proteins, then converts them to activities of proteins taking account of \code{loga.protein} and protein length. If the \code{protein} argument is numeric (indicating rownumbers in \code{thermo$protein}), the values of \samp{Astar/RT} are compared with the output of \code{\link{affinity}}, and those of the equilibrium degrees of formation of the residues and the chemical activities of the proteins with the output of \code{\link{diagram}}. If the values in any of these tests are are not \code{\link{all.equal}} an error is produced indicating a bug. 
-
-  \code{MP90.cp} takes \code{protein} (name of protein) and \code{T} (one or more temperatures in \eqn{^{\circ}}{°}C) and returns the additive heat capacity (J mol \eqn{^{-1}}{^-1}) of the unfolded protein using values of heat capacities of the residues taken from Makhatadze and Privalov, 1990. Those authors provided values of heat capacity at six points between 5 and 125 \eqn{^{\circ}}{°}C; this function interpolates (using \code{\link{splinefun}}) values at other temperatures.
 }
 
-\seealso{
-\code{\link{ionize.aa}} for an example that compares \code{MP90.cp} with heat capacities calculated in CHNOSZ at different temperatures and pHs.
-See also \code{demo{protein.equil}}.
-}
-
 \examples{\dontshow{data(thermo)}
 # search by name in thermo$protein
 ip1 <- protein.info("LYSC_CHICK")
@@ -68,9 +65,9 @@
 # two organisms with the same protein name
 ip3 <- protein.info("MYG", c("HORSE", "PHYCA"))
 # their amino acid compositions
-ip2aa(ip3)
+protein.info(ip3)
 # their thermodynamic properties by group additivity
-aa2eos(ip2aa(ip3))
+aa2eos(protein.info(ip3))
 
 # an example of an unrecognized protein name
 ip4 <- protein.info("MYGPHYCA")
@@ -80,13 +77,13 @@
 # index in thermo$protein
 ip <- protein.info("LYSC_CHICK")
 # amino acid composition
-ip2aa(ip)
+protein.info(ip)
 # length and chemical formula
 protein.length(ip)
 protein.formula(ip)
 # group additivity for thermodynamic properties and HKF equation-of-state
 # parameters of non-ionized protein
-aa2eos(ip2aa(ip))
+aa2eos(protein.info(ip))
 # calculation of standard thermodynamic properties
 # (subcrt uses the species name, not ip)
 subcrt("LYSC_CHICK")
@@ -103,7 +100,7 @@
 ## these are all the same
 protein.formula("P53_PIG")
 protein.formula(protein.info("P53_PIG"))
-protein.formula(ip2aa(protein.info("P53_PIG")))
+protein.formula(protein.info(protein.info("P53_PIG")))
 
 ## using protein.formula: average oxidation state of 
 ## carbon of proteins from different organisms (Dick, 2014)
@@ -138,8 +135,6 @@
 
 \references{
   Dick, J. M. (2014) Average oxidation state of carbon in proteins. \emph{J. R. Soc. Interface} \bold{11}, 20131095. \url{http://dx.doi.org/10.1098/rsif.2013.1095}
-
-  Makhatadze, G. I. and Privalov, P. L. (1990) Heat capacity of proteins. 1. Partial molar heat capacity of individual amino acid residues in aqueous solution: Hydration effect \emph{J. Mol. Biol.} \bold{213}, 375--384. \url{http://dx.doi.org/10.1016/S0022-2836(05)80197-4}
 }
 
 \concept{Protein thermodynamic modeling}

Added: pkg/CHNOSZ/man/util.protein.Rd
===================================================================
--- pkg/CHNOSZ/man/util.protein.Rd	                        (rev 0)
+++ pkg/CHNOSZ/man/util.protein.Rd	2017-02-20 12:27:24 UTC (rev 167)
@@ -0,0 +1,36 @@
+\encoding{UTF-8}
+\name{util.protein}
+\alias{group.formulas}
+\alias{MP90.cp}
+\title{Functions for Proteins (Other Calculations)}
+
+\description{
+  Return chemical formulas of groups in proteins, and calculate heat capacity using an additivity model from the literature.
+}
+
+\usage{
+  MP90.cp(protein, T)
+  group.formulas()
+}
+
+\arguments{
+  \item{protein}{proteins specified in any format usable by \code{\link{protein.info}}}
+  \item{T}{numeric, temperature in \eqn{^{\circ}}{°}C}
+}
+
+\details{
+\code{group.formulas} returns the chemical formulas of each of the 20 common amino acid residues in proteins, as well as the terminal -H and -H (treated as the [H2O] group).
+
+\code{MP90.cp} takes \code{protein} (name of protein) and \code{T} (one or more temperatures in \eqn{^{\circ}}{°}C) and returns the additive heat capacity (J mol \eqn{^{-1}}{^-1}) of the unfolded protein using values of heat capacities of the residues taken from Makhatadze and Privalov, 1990.
+Those authors provided values of heat capacity at six points between 5 and 125 \eqn{^{\circ}}{°}C; this function interpolates (using \code{\link{splinefun}}) values at other temperatures.
+}
+
+\seealso{
+\code{\link{ionize.aa}} for an example that compares \code{MP90.cp} with heat capacities calculated in CHNOSZ at different temperatures and pHs.
+}
+
+\references{
+  Makhatadze, G. I. and Privalov, P. L. (1990) Heat capacity of proteins. 1. Partial molar heat capacity of individual amino acid residues in aqueous solution: Hydration effect \emph{J. Mol. Biol.} \bold{213}, 375--384. \url{http://dx.doi.org/10.1016/S0022-2836(05)80197-4}
+}
+
+\concept{utilities}

Modified: pkg/CHNOSZ/tests/testthat/test-add.protein.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-add.protein.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/tests/testthat/test-add.protein.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -19,7 +19,7 @@
   expect_message(protein.info(c("LYSC_CHICK", "MYGPHYCA")), "1 protein not matched")
   expect_error(seq2aa("LYS_CHICK", "XXX"), "no characters match an amino acid")
   expect_error(add.protein(count.aa("AAA")), "not a data frame with the same columns as thermo\\$protein")
-  expect_message(add.protein(ip2aa(protein.info("CYC_BOVIN"))), "replaced 1 existing protein\\(s\\)")
+  expect_message(add.protein(protein.info(protein.info("CYC_BOVIN"))), "replaced 1 existing protein\\(s\\)")
 })
 
 test_that("group additivity for proteins gives expected values", {

Modified: pkg/CHNOSZ/tests/testthat/test-ionize.aa.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-ionize.aa.R	2017-02-20 11:27:11 UTC (rev 166)
+++ pkg/CHNOSZ/tests/testthat/test-ionize.aa.R	2017-02-20 12:27:24 UTC (rev 167)
@@ -36,7 +36,7 @@
   Z.LYSC_CHICK.100 <- c(13.3, 7.8, -3.4, -15.2, -20.9, -20.9)
   Z.LYSC_CHICK.150 <- c(13.3, 7.1, -8.0, -20.0, -20.9, -20.9)
   Z.LYSC_CHICK.25_oxid <- c(13.5, 8.7, 7.6, 1.6, -6.2, -12.9)
-  aa <- ip2aa(protein.info("LYSC_CHICK"))
+  aa <- protein.info(protein.info("LYSC_CHICK"))
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/chnosz -r 167