[CHNOSZ-commits] r722 - in pkg/CHNOSZ: . R inst inst/tinytest man

Sat Apr 16 10:19:04 CEST 2022

Author: jedick
Date: 2022-04-16 10:19:03 +0200 (Sat, 16 Apr 2022)
New Revision: 722

Added:
   pkg/CHNOSZ/R/affinity_rank.R
   pkg/CHNOSZ/man/affinity_rank.Rd
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/NAMESPACE
   pkg/CHNOSZ/R/add.protein.R
   pkg/CHNOSZ/R/diagram.R
   pkg/CHNOSZ/inst/NEWS.Rd
   pkg/CHNOSZ/inst/tinytest/test-affinity.R
   pkg/CHNOSZ/man/add.protein.Rd
   pkg/CHNOSZ/man/diagram.Rd
Log:
Add affinity_rank()


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/DESCRIPTION	2022-04-16 08:19:03 UTC (rev 722)
@@ -1,6 +1,6 @@
-Date: 2022-04-11
+Date: 2022-04-16
 Package: CHNOSZ
-Version: 1.9.9-14
+Version: 1.9.9-15
 Title: Thermodynamic Calculations and Diagrams for Geochemistry
 Authors at R: c(
     person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),

Modified: pkg/CHNOSZ/NAMESPACE
===================================================================
--- pkg/CHNOSZ/NAMESPACE	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/NAMESPACE	2022-04-16 08:19:03 UTC (rev 722)
@@ -56,7 +56,9 @@
 # added 20200716 or later
   "mash", "mix", "rebalance",
 # added 20220324
-  "logB_to_OBIGT"
+  "logB_to_OBIGT",
+# added 20220416
+  "affinity_rank"
 )
 
 # Load shared objects

Modified: pkg/CHNOSZ/R/add.protein.R
===================================================================
--- pkg/CHNOSZ/R/add.protein.R	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/R/add.protein.R	2022-04-16 08:19:03 UTC (rev 722)
@@ -54,17 +54,22 @@
   return(out)
 }
 
-add.protein <- function(aa) {
-  # add a properly constructed data frame of 
+add.protein <- function(aa, as.residue = FALSE) {
+  # Add a properly constructed data frame of 
   # amino acid counts to thermo()$protein
   thermo <- get("thermo", CHNOSZ)
   if(!identical(colnames(aa), colnames(thermo$protein)))
     stop("'aa' does not have the same columns as thermo()$protein")
-  # find any protein IDs that are duplicated
-  po <- paste(aa$protein, aa$organism, sep="_")
+  # Normalize by protein length if as.residue = TRUE 20220416
+  if(as.residue) {
+    pl <- protein.length(aa)
+    aa[, 5:25] <- aa[, 5:25] / pl
+  }
+  # Find any protein IDs that are duplicated
+  po <- paste(aa$protein, aa$organism, sep = "_")
   ip <- pinfo(po)
   ipdup <- !is.na(ip)
-  # now we're ready to go
+  # Now we're ready to go
   tp.new <- thermo$protein
   if(!all(ipdup)) tp.new <- rbind(tp.new, aa[!ipdup, ])
   if(any(ipdup)) tp.new[ip[ipdup], ] <- aa[ipdup, ]
@@ -71,9 +76,9 @@
   rownames(tp.new) <- NULL
   thermo$protein <- tp.new
   assign("thermo", thermo, CHNOSZ)
-  # return the new rownumbers
+  # Return the new rownumbers
   ip <- pinfo(po)
-  # make some noise
+  # Make some noise
   if(!all(ipdup)) message("add.protein: added ", nrow(aa)-sum(ipdup), " new protein(s) to thermo()$protein")
   if(any(ipdup)) message("add.protein: replaced ", sum(ipdup), " existing protein(s) in thermo()$protein")
   return(ip)

Added: pkg/CHNOSZ/R/affinity_rank.R
===================================================================
--- pkg/CHNOSZ/R/affinity_rank.R	                        (rev 0)
+++ pkg/CHNOSZ/R/affinity_rank.R	2022-04-16 08:19:03 UTC (rev 722)
@@ -0,0 +1,27 @@
+# Calculate normalized sum of ranking of affinities for species in designated groups
+# 20220416 jmd first version
+
+affinity_rank <- function(aout, groups) {
+  # Put the affinities into matrix form
+  amat <- sapply(aout$values, as.numeric)
+  # Calculate ranks
+  # https://stackoverflow.com/questions/1412775/pmax-parallel-maximum-equivalent-for-rank-in-r
+  arank <- apply(amat, 1, rank)
+  # Get the normalized ranks for each group
+  grank <- sapply(groups, function(group) {
+    # Sum the ranks for this group and divide by number of species in the group
+    if(inherits(group, "logical")) n <- sum(group)
+    if(inherits(group, "integer")) n <- length(group)
+    colSums(arank[group, ]) / n
+  })
+  # Restore dims
+  dims <- dim(aout$values[[1]])
+  glist <- apply(grank, 2, "dim<-", dims, simplify = FALSE)
+  aout$values <- glist
+  # Rename species to group names (for use by diagram())
+  aout$species <- aout$species[1:length(groups), ]
+  aout$species$name <- names(groups)
+  # "Sign" the object with our function name
+  aout$fun <- "affinity_rank"
+  aout
+}

Modified: pkg/CHNOSZ/R/diagram.R
===================================================================
--- pkg/CHNOSZ/R/diagram.R	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/R/diagram.R	2022-04-16 08:19:03 UTC (rev 722)
@@ -44,10 +44,18 @@
   ## Check that eout is a valid object
   efun <- eout$fun
   if(length(efun)==0) efun <- ""
-  if(!(efun %in% c("affinity", "equilibrate") | grepl("solubilit", efun)))
-    stop("'eout' is not the output from affinity(), equilibrate(), solubility(), or solubilities()")
+  if(!(efun %in% c("affinity", "affinity_rank", "equilibrate") | grepl("solubilit", efun)))
+    stop("'eout' is not the output from one of these functions: affinity, affinity_rank, equilibrate, or solubility")
   # For solubilities(), default type is loga.balance 20210303
   if(grepl("solubilities", efun) & missing(type)) type <- "loga.balance"
+  # Check balance argument for affinity_rank() 20220416
+  if(efun == "affinity_rank") {
+    if(!identical(balance, 1)) {
+      if(!is.null(balance)) stop("balance = 1 or NULL is required for plotting output of affinity_rank()")
+      if(is.null(balance)) message("diagram: setting balance = 1 for plotting output of affinity_rank()")
+      balance <- 1
+    }
+  }
 
   ## 'type' can be:
   #    'auto'                - property from affinity() (1D) or maximum affinity (affinity 2D) (aout) or loga.equil (eout)

Modified: pkg/CHNOSZ/inst/NEWS.Rd
===================================================================
--- pkg/CHNOSZ/inst/NEWS.Rd	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/inst/NEWS.Rd	2022-04-16 08:19:03 UTC (rev 722)
@@ -31,9 +31,8 @@
   \subsection{OBIGT DATABASE}{
     \itemize{
 
-      \item Move aqueous CH\s{4} back to \file{organic_aq.csv}. Thanks to
-      Irucka Embry for flagging this. Also move gaseous CH\s{4} from
-      \file{inorganic_gas.csv} to \file{organic_gas.csv}.
+      \item Move aqueous CH\s{4} back to \file{organic_aq.csv} and move gaseous
+      CH\s{4} from \file{inorganic_gas.csv} to \file{organic_gas.csv}.
       
       \item Modify \code{OBIGT(no.organics = TRUE)} to prune the database of
       organic compounds \emph{except} CH\s{4}. The ability to remove hundreds
@@ -45,7 +44,7 @@
     }
   }
 
-  \subsection{NEW FEATURES}{
+  \subsection{NEW FEATURES: THERMODYNAMIC DATA}{
     \itemize{
 
       \item Add \code{logB_to_OBIGT()} to fit three thermodynamic parameters
@@ -63,6 +62,20 @@
     }
   }
 
+  \subsection{NEW FEATURES: PROTEINS}{
+    \itemize{
+
+      \item Add an \strong{as.residue} argument to \code{add.protein()} to
+      normalize amino acid compositions by protein length.
+
+      \item Add function \code{affinity_rank()} to calculate (sums of ranks of
+      affinities for species in designated groups) normalized by number of
+      species in each group. The output of the new function can be used by
+      \code{diagram()}.
+
+    }
+  }
+
   \subsection{REMOVED FEATURES}{
     \itemize{
 

Modified: pkg/CHNOSZ/inst/tinytest/test-affinity.R
===================================================================
--- pkg/CHNOSZ/inst/tinytest/test-affinity.R	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/inst/tinytest/test-affinity.R	2022-04-16 08:19:03 UTC (rev 722)
@@ -122,10 +122,6 @@
 # TODO: add comparison with results from loading proteins via species()
 
 info <- "affinity() for proteins (with/without 'iprotein') returns same value as in previous package versions"
-# our test case is CSG_HALJP because it has no methionine
-# (aqueous [Met] was updated in 0.9-9)
-# these values were calculated using versions 0.6, 0.8 and 0.9-7
-# (25 degrees C, 1 bar, basis species "CHNOS" or "CHNOS+")
 A.2303RT.nonionized <- -3795.297
 A.2303RT.ionized <- -3075.222
 # first for nonionized protein

Modified: pkg/CHNOSZ/man/add.protein.Rd
===================================================================
--- pkg/CHNOSZ/man/add.protein.Rd	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/man/add.protein.Rd	2022-04-16 08:19:03 UTC (rev 722)
@@ -9,7 +9,7 @@
 }
 
 \usage{
-  add.protein(aa)
+  add.protein(aa, as.residue = FALSE)
   seq2aa(protein, sequence)
   aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
 }
@@ -16,6 +16,7 @@
 
 \arguments{
   \item{aa}{data frame, amino acid composition in the format of \code{thermo()$protein}}
+  \item{as.residue}{logical, normalize by protein length?}
   \item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}$protein})}
   \item{sequence}{character, protein sequence}
   \item{abundance}{numeric, abundances of proteins}
@@ -39,6 +40,7 @@
 The name used in the output is taken from the first row of \code{aa} or from \code{protein} and \code{organism} if they are specified.
 
 Given amino acid compositions returned by the \code{*aa} functions described above, \code{add.protein} adds them to \code{thermo()$protein} for use by other functions in CHNOSZ.
+Set \code{as.residue} to TRUE to normalize by protein length; each input amino acid composition is divided by the corresponding number of residues, with the result that the sum of amino acid frequencies for each protein is 1.
 The amino acid compositions of proteins in \code{aa} with the same name as one in \code{thermo()$protein} are replaced.
 The value returned by this function is the rownumbers of \code{thermo()$protein} that are added and/or replaced.
 }
@@ -45,14 +47,16 @@
 
 \examples{
 \dontshow{reset()}
-# manually adding a new protein
-# Human Gastric juice peptide 1
+# Get the amino acid composition of a protein sequence
+# (Human Gastric juice peptide 1)
 aa <- seq2aa("GAJU_HUMAN", "LAAGKVEDSD")
+# Add the protein to CHNOSZ
 ip <- add.protein(aa)
+# Calculate the protein length and chemical formula
 protein.length(ip) # 10
-# the chemical formula of this peptide
 as.chemical.formula(protein.formula(ip)) # "C41H69N11O18"
-# we can also calculate a formula without using add.protein
+
+# Calculate a formula without using add.protein
 aa <- seq2aa("pentapeptide_test", "ANLSG")
 as.chemical.formula(protein.formula(aa))
 }

Added: pkg/CHNOSZ/man/affinity_rank.Rd
===================================================================
--- pkg/CHNOSZ/man/affinity_rank.Rd	                        (rev 0)
+++ pkg/CHNOSZ/man/affinity_rank.Rd	2022-04-16 08:19:03 UTC (rev 722)
@@ -0,0 +1,52 @@
+\encoding{UTF-8}
+\name{affinity_rank}
+\alias{affinity_rank}
+\title{Normalized Sums of Ranks of Chemical Affinities}
+\description{
+Affinity ranking for groups of species.
+}
+
+\usage{
+  affinity_rank(aout, groups)
+}
+
+\arguments{
+  \item{aout}{list, output of \code{\link{affinity}}}
+  \item{groups}{named list of indices (integer or numeric) for species in each group}
+}
+
+\details{
+The following calculations are applied to each set of conditions (i.e., grid point if \code{\link{affinity}} was called with two variables).
+The \code{\link{rank}}s of affinities for all species are first computed.
+Then, the ranks for the species in each group are summed and divided by the number of species in that group (this is the normalization step).
+}
+
+\value{
+The normalized sum of ranks are inserted into the \code{values} element of \code{aout}, and the names of the groups are inserted into the \code{species} element.
+The result can be used by \code{\link{diagram}} to show the groups with the highest normalized sum of ranks.
+}
+
+\note{
+The reaction coefficients in the \code{species} element of the returned value of \code{aout} are not valid.
+Because balancing on a basis species (i.e., dividing by its reaction coefficient) would be incorrect, \code{diagram} enforces \code{balance = 1} so that that the normalized sums of ranks are used as-is.
+}
+
+\examples{
+\dontshow{reset()}# Compare Rubisco proteins from three domains
+datfile <- system.file("extdata/cpetc/rubisco.csv", package = "CHNOSZ")
+fastafile <- system.file("extdata/protein/rubisco.fasta", package = "CHNOSZ")
+dat <- read.csv(datfile)
+aa <- read.fasta(fastafile)
+groups <- sapply(c("A", "B", "E"), "==", dat$domain, simplify = FALSE)
+names(groups) <- c("Archaea", "Bacteria", "Eukaryota")
+ip <- add.protein(aa, as.residue = TRUE)
+basis("QEC")
+aout <- affinity(O2 = c(-74, -66, 100), H2O = c(-4, 4, 100), iprotein = ip)
+arank <- affinity_rank(aout, groups = groups)
+nspecies <- sapply(groups, sum)
+names <- paste0(names(groups), " (", nspecies, ")")
+diagram(arank, fill = "terrain", font = 2, names = names, format.names = FALSE)
+title("Chemical affinity ranking of Rubisco proteins")
+}
+
+\concept{Extended workflow}

Modified: pkg/CHNOSZ/man/diagram.Rd
===================================================================
--- pkg/CHNOSZ/man/diagram.Rd	2022-04-11 02:27:40 UTC (rev 721)
+++ pkg/CHNOSZ/man/diagram.Rd	2022-04-16 08:19:03 UTC (rev 722)
@@ -38,7 +38,7 @@
 }
 
 \arguments{
-  \item{eout}{list, object returned by \code{\link{equilibrate}} or \code{\link{affinity}}}
+  \item{eout}{list, object returned by \code{\link{affinity}}, \code{\link{equilibrate}} or related functions}
   \item{type}{character, type of plot, or name of basis species whose activity to plot}
   \item{alpha}{logical or character (\samp{balance}), for speciation diagrams, plot degree of formation instead of activities?}
   \item{normalize}{logical, divide chemical affinities by balance coefficients and rescale activities to whole formulas?}
@@ -92,8 +92,8 @@
 
 \details{
 
-This function displays diagrams representing either chemical affinities, or equilibrium chemical activities of species.
-The first argument is the output from \code{\link{affinity}}, \code{\link{equilibrate}}, or \code{\link{solubility}}.
+This function displays diagrams representing either chemical affinities or equilibrium chemical activities of species.
+The first argument is the output from \code{\link{affinity}}, \code{\link{affinity_rank}}, \code{\link{equilibrate}}, or \code{\link{solubility}}.
 0-D diagrams, at a single point, are shown as \code{\link{barplot}}s.
 1-D diagrams, for a single variable on the x-axis, are plotted as lines.
 2-D diagrams, for two variables, are plotted as predominance fields.
@@ -114,7 +114,7 @@
 
 Normalizing protein formulas by length gives \dQuote{residue equivalents} (Dick and Shock, 2011) that are useful for equilibrium calculations with proteins.
 \code{normalize} and \code{as.residue} are only usable when \code{eout} is the output from \code{affinity}, and only one can be TRUE.
-If \code{normalize} is TRUE, the strategy is to divide reactions (formulas and affinities) by protein length, calculate activities of residues in equilibrium, but plot corresponding activities of the proteins.
+If \code{normalize} is TRUE, formation reactions and their affinities are first divided by protein length, so equal activities of residue equivalents are considered; then, the residue activities are rescaled to whole proteins for making the plot.
 If \code{as.residue} is TRUE, no rescaling is performed, so the diagram represents activities of the residues, not the whole proteins.
 
 }