[CHNOSZ-commits] r166 - in pkg/CHNOSZ: . R demo inst man tests/testthat vignettes

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Mon Feb 20 12:27:12 CET 2017


Author: jedick
Date: 2017-02-20 12:27:11 +0100 (Mon, 20 Feb 2017)
New Revision: 166

Added:
   pkg/CHNOSZ/R/add.protein.R
   pkg/CHNOSZ/man/add.protein.Rd
   pkg/CHNOSZ/tests/testthat/test-add.protein.R
Removed:
   pkg/CHNOSZ/R/iprotein.R
   pkg/CHNOSZ/man/iprotein.Rd
   pkg/CHNOSZ/tests/testthat/test-iprotein.R
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/examples.R
   pkg/CHNOSZ/R/info.R
   pkg/CHNOSZ/R/protein.info.R
   pkg/CHNOSZ/R/util.affinity.R
   pkg/CHNOSZ/R/util.expression.R
   pkg/CHNOSZ/demo/bugstab.R
   pkg/CHNOSZ/demo/protein.equil.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/man/data.Rd
   pkg/CHNOSZ/man/info.Rd
   pkg/CHNOSZ/man/ionize.aa.Rd
   pkg/CHNOSZ/man/protein.Rd
   pkg/CHNOSZ/man/protein.info.Rd
   pkg/CHNOSZ/tests/testthat/test-affinity.R
   pkg/CHNOSZ/tests/testthat/test-ionize.aa.R
   pkg/CHNOSZ/tests/testthat/test-protein.info.R
   pkg/CHNOSZ/tests/testthat/test-util.affinity.R
   pkg/CHNOSZ/vignettes/anintro.Rmd
   pkg/CHNOSZ/vignettes/equilibrium.Rnw
   pkg/CHNOSZ/vignettes/equilibrium.lyx
   pkg/CHNOSZ/vignettes/hotspring.Rnw
   pkg/CHNOSZ/vignettes/hotspring.lyx
Log:
rename iprotein() to protein.info()


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/DESCRIPTION	2017-02-20 11:27:11 UTC (rev 166)
@@ -1,6 +1,6 @@
-Date: 2017-02-19
+Date: 2017-02-20
 Package: CHNOSZ
-Version: 1.0.8-55
+Version: 1.0.8-56
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey Dick
 Maintainer: Jeffrey Dick <j3ffdick at gmail.com>

Copied: pkg/CHNOSZ/R/add.protein.R (from rev 165, pkg/CHNOSZ/R/iprotein.R)
===================================================================
--- pkg/CHNOSZ/R/add.protein.R	                        (rev 0)
+++ pkg/CHNOSZ/R/add.protein.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -0,0 +1,164 @@
+# CHNOSZ/add.protein.R
+# calculate properties of proteins 20061109 jmd
+# reorganize protein functions 20120513
+
+# add.protein - add amino acid counts to thermo$protein (returns iprotein)
+# ip2aa - select amino acid counts (data frame) from thermo$protein
+# aa2eos - perform group additivity calculations
+# seq2aa - calculate amino acid counts from a sequence
+# aasum - combine amino acid counts (sum, average, or weighted sum by abundance)
+# read.aa - read amino acid counts from a file
+
+ip2aa <- function(protein, organism=NULL, residue=FALSE) {
+  # return amino acid counts (rows from thermo$protein)
+  # or 'protein' if it is a data frame
+  if(is.data.frame(protein)) return(protein)
+  iprotein <- protein.info(protein, organism)
+  # drop NA matches
+  iprotein <- iprotein[!is.na(iprotein)]
+  out <- get("thermo")$protein[iprotein, ]
+  # compute per-residue counts
+  if(residue) out[, 5:25] <- out[, 5:25]/rowSums(out[, 6:25])
+  return(out)
+}
+
+aa2eos <- function(aa, state=get("thermo")$opt$state) {
+  # display and return the properties of
+  # proteins calculated from amino acid composition
+  # the names of the protein backbone groups depend on the state
+  # [UPBB] for aq or [PBB] for cr
+  if(state=="aq") bbgroup <- "UPBB" else bbgroup <- "PBB"
+  # names of the AABB, sidechain and protein backbone groups
+  groups <- c("AABB", colnames(aa)[6:25], bbgroup)
+  # put brackets around the group names
+  groups <- paste("[", groups, "]", sep="")
+  # the rownumbers of the groups in thermo$obigt
+  groups_state <- paste(groups, state)
+  obigt <- get("thermo")$obigt
+  obigt_state <- paste(obigt$name, obigt$state)
+  igroup <- match(groups_state, obigt_state)
+  # the properties are in columns 8-20 of thermo$obigt
+  groupprops <- obigt[igroup, 8:20]
+  # the elements in each of the groups
+  groupelements <- i2A(igroup)
+  # a function to work on a single row of aa
+  eosfun <- function(aa) {
+    # numbers of groups: chains [=AABB], sidechains, protein backbone
+    nchains <- as.numeric(aa[, 5])
+    length <- sum(as.numeric(aa[, 6:25]))
+    npbb <- length - nchains
+    ngroups <- c(nchains, as.numeric(aa[, 6:25]), npbb)
+    # the actual adding and multiplying of thermodynamic properties
+    # hmm. seems like we have to split up the multiplication/transposition
+    # operations to get the result into multiple columns. 20071213
+    eos <- t(data.frame(colSums(groupprops * ngroups)))
+    # to get the formula, add up and round the group compositions 20090331
+    f.in <- round(colSums(groupelements * ngroups), 3)
+    # take out any elements that don't appear (sometimes S)
+    f.in <- f.in[f.in!=0]
+    # turn it into a formula
+    f <- as.chemical.formula(f.in)
+    # now the species name
+    name <- paste(aa$protein, aa$organism, sep="_")
+    # make some noise for the user
+    message("aa2eos: found ", appendLF=FALSE)
+    message(name, " (", f, ", ", appendLF=FALSE)
+    message(round(length, 3), " residues)")
+    ref <- aa$ref
+    header <- data.frame(name=name, abbrv=NA, formula=f, state=state, ref1=ref, ref2=NA, date=NA, stringsAsFactors=FALSE)
+    eosout <- cbind(header, eos)
+    return(eosout)
+  }
+  # loop over each row of aa
+  out <- lapply(1:nrow(aa), function(i) eosfun(aa[i, ]))
+  out <- do.call(rbind, out)
+  rownames(out) <- NULL
+  return(out)
+}
+
+seq2aa <- function(protein, sequence) {
+  # remove newlines and whitespace
+  sequence <- gsub("\\s", "", gsub("[\r\n]", "", sequence))
+  # make a data frame from counting the amino acids in the sequence
+  caa <- count.aa(sequence)
+  colnames(caa) <- aminoacids(3)
+  # a protein with no amino acids is sort of boring
+  if(all(caa==0)) stop("no characters match an amino acid")
+  ip <- suppressMessages(protein.info(protein))
+  # now make the data frame
+  po <- strsplit(protein, "_")[[1]]
+  aa <- data.frame(protein=po[1], organism=po[2], ref=NA, abbrv=NA, stringsAsFactors=FALSE)
+  aa <- cbind(aa, chains=1, caa)
+  return(aa)
+}
+
+aasum <- function(aa, abundance=1, average=FALSE, protein=NULL, organism=NULL) {
+  # returns the sum of the amino acid counts in aa,
+  # multiplied by the abundances of the proteins
+  abundance <- rep(abundance, length.out=nrow(aa))
+  # drop any NA rows or abundances
+  ina.aa <- is.na(aa$chains)
+  ina.ab <- is.na(abundance)
+  ina <- ina.aa | ina.ab
+  if(any(ina)) {
+    aa <- aa[!ina, ]
+    abundance <- abundance[!ina]
+    message("aasum: dropped ", sum(ina), " proteins with NA composition and/or abundance")
+  }
+  # we don't know how to deal with different numbers of polypeptide chains
+  if(!all(aa$chains==aa$chains[1])) stop("different numbers of polypeptide chains")
+  # multiply
+  aa[, 6:25] <- aa[, 6:25] * abundance
+  # sum
+  out <- aa[1, ]
+  out[, 5:25] <- colSums(aa[, 5:25])
+  # average if told to do so
+  if(average) {
+    # polypeptide chains by number of proteins, residues by frequence
+    out[, 5] <- out[, 5]/nrow(aa)
+    out[, 6:25] <- out[, 6:25]/sum(abundance)
+  }
+  # add protein and organism names if given
+  if(!is.null(protein)) out$protein <- protein
+  if(!is.null(organism)) out$organism <- organism
+  return(out)
+}
+
+read.aa <- function(file="protein.csv", ...) {
+  # 20090428 added colClasses here
+  # 20140128 added as.is=TRUE (in case numeric values are stored in ref or abbrv column)
+  aa <- read.csv(file, colClasses=c(rep("character", 2), NA, NA, rep("numeric", 21)), as.is=TRUE, ...)
+  if(!identical(colnames(aa), colnames(get("thermo")$protein)))
+    stop(paste("format of", file, "is incompatible with thermo$protein"))
+  return(aa)
+}
+
+add.protein <- function(aa) {
+  # add a properly constructed data frame of 
+  # amino acid counts to thermo$protein
+  thermo <- get("thermo")
+  if(!identical(colnames(aa), colnames(thermo$protein)))
+    stop("the value of 'aa' is not a data frame with the same columns as thermo$protein")
+  # find any protein IDs that are duplicated
+  po <- paste(aa$protein, aa$organism, sep="_")
+  ip <- suppressMessages(protein.info(po))
+  ipdup <- !is.na(ip)
+  # now we're ready to go
+  tp.new <- thermo$protein
+  if(!all(ipdup)) tp.new <- rbind(tp.new, aa[!ipdup, ])
+  if(any(ipdup)) {
+    if(any(sapply(1:4, function(i){is.factor(aa[, i])})))
+      stop(paste("converting factors causes problems replacing protein data",
+        "  data file should be read using e.g. aa <- read.csv(file, stringsAsFactors=FALSE)", sep="\n"))
+    tp.new[ip[ipdup], ] <- aa[ipdup, ]
+  }
+  rownames(tp.new) <- NULL
+  thermo$protein <- tp.new
+  assign("thermo", thermo, "CHNOSZ")
+  # return the new rownumbers
+  ip <- protein.info(po)
+  # make some noise
+  if(!all(ipdup)) message("add.protein: added ", nrow(aa)-sum(ipdup), " new protein(s) to thermo$protein")
+  if(any(ipdup)) message("add.protein: replaced ", sum(ipdup), " existing protein(s) in thermo$protein")
+  return(ip)
+}

Modified: pkg/CHNOSZ/R/examples.R
===================================================================
--- pkg/CHNOSZ/R/examples.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/examples.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -11,7 +11,7 @@
     "util.misc", "util.program",
     "util.seq", "util.units", "taxonomy", "info", "protein.info", "hkf", "water", "subcrt",
     "makeup", "basis", "swap.basis", "species", "affinity", "util.affinity", "equil.boltzmann", 
-    "diagram", "buffer", "iprotein", "protein", "ionize.aa", "more.aa", "read.expr",
+    "diagram", "buffer", "add.protein", "protein", "ionize.aa", "more.aa", "read.expr",
     "objective", "revisit", "transfer", "anim", "EOSregress", "wjd")
   plot.it <- FALSE
   if(is.character(do.png))

Modified: pkg/CHNOSZ/R/info.R
===================================================================
--- pkg/CHNOSZ/R/info.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/info.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -31,7 +31,7 @@
   # since thermo$obigt$abbrv contains NAs, convert NA results to FALSE
   matches.species[is.na(matches.species)] <- FALSE
   # turn it in to no match if it's a protein in the wrong state
-  ip <- suppressMessages(iprotein(species))
+  ip <- suppressMessages(protein.info(species))
   if(any(matches.species) & !is.na(ip) & !is.null(state)) {
     matches.state <- matches.species & grepl(state, thermo$obigt$state)
     if(!any(matches.state)) matches.species <- FALSE

Deleted: pkg/CHNOSZ/R/iprotein.R
===================================================================
--- pkg/CHNOSZ/R/iprotein.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/iprotein.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -1,191 +0,0 @@
-# CHNOSZ/iprotein.R
-# calculate properties of proteins 20061109 jmd
-# reorganize protein functions 20120513
-
-# iprotein - find rownumber in thermo$protein
-# ip2aa - select amino acid counts (data frame) from thermo$protein
-# aa2eos - perform group additivity calculations
-# seq2aa - calculate amino acid counts from a sequence
-# aasum - combine amino acid counts (sum, average, or weighted sum by abundance)
-# read.aa - read amino acid counts from a file
-# add.protein - add amino acid counts to thermo$protein (returns iprotein)
-
-iprotein <- function(protein, organism=NULL) {
-  # find the rownumber(s) of thermo$protein that matches
-  # 'protein' numeric (the rownumber itself)
-  # 'protein' character, e.g. LYSC_CHICK
-  # 'protein' and 'organism', e.g. 'LYSC', 'CHICK'
-  thermo <- get("thermo")
-  if(is.numeric(protein)) {
-    iproteins <- 1:nrow(thermo$protein)
-    protein[!protein %in% iproteins] <- NA
-    iprotein <- protein
-  } else {
-    # from here we'll search by protein/organism pairs
-    tp.po <- paste(thermo$protein$protein, thermo$protein$organism, sep="_")
-    if(is.null(organism)) my.po <- protein
-    else my.po <- paste(protein, organism, sep="_")
-    iprotein <- match(my.po, tp.po)
-  }
-  # tell the user about NA's
-  if(any(is.na(iprotein))) {
-    nNA <- sum(is.na(iprotein))
-    if(nNA==1) ptext <- "" else ptext <- "s"
-    message("iprotein: ", sum(is.na(iprotein)), " protein", ptext, " not matched")
-  }
-  return(iprotein)
-}
-
-ip2aa <- function(protein, organism=NULL, residue=FALSE) {
-  # return amino acid counts (rows from thermo$protein)
-  # or 'protein' if it is a data frame
-  if(is.data.frame(protein)) return(protein)
-  iprotein <- iprotein(protein, organism)
-  # drop NA matches
-  iprotein <- iprotein[!is.na(iprotein)]
-  out <- get("thermo")$protein[iprotein, ]
-  # compute per-residue counts
-  if(residue) out[, 5:25] <- out[, 5:25]/rowSums(out[, 6:25])
-  return(out)
-}
-
-aa2eos <- function(aa, state=get("thermo")$opt$state) {
-  # display and return the properties of
-  # proteins calculated from amino acid composition
-  # the names of the protein backbone groups depend on the state
-  # [UPBB] for aq or [PBB] for cr
-  if(state=="aq") bbgroup <- "UPBB" else bbgroup <- "PBB"
-  # names of the AABB, sidechain and protein backbone groups
-  groups <- c("AABB", colnames(aa)[6:25], bbgroup)
-  # put brackets around the group names
-  groups <- paste("[", groups, "]", sep="")
-  # the rownumbers of the groups in thermo$obigt
-  groups_state <- paste(groups, state)
-  obigt <- get("thermo")$obigt
-  obigt_state <- paste(obigt$name, obigt$state)
-  igroup <- match(groups_state, obigt_state)
-  # the properties are in columns 8-20 of thermo$obigt
-  groupprops <- obigt[igroup, 8:20]
-  # the elements in each of the groups
-  groupelements <- i2A(igroup)
-  # a function to work on a single row of aa
-  eosfun <- function(aa) {
-    # numbers of groups: chains [=AABB], sidechains, protein backbone
-    nchains <- as.numeric(aa[, 5])
-    length <- sum(as.numeric(aa[, 6:25]))
-    npbb <- length - nchains
-    ngroups <- c(nchains, as.numeric(aa[, 6:25]), npbb)
-    # the actual adding and multiplying of thermodynamic properties
-    # hmm. seems like we have to split up the multiplication/transposition
-    # operations to get the result into multiple columns. 20071213
-    eos <- t(data.frame(colSums(groupprops * ngroups)))
-    # to get the formula, add up and round the group compositions 20090331
-    f.in <- round(colSums(groupelements * ngroups), 3)
-    # take out any elements that don't appear (sometimes S)
-    f.in <- f.in[f.in!=0]
-    # turn it into a formula
-    f <- as.chemical.formula(f.in)
-    # now the species name
-    name <- paste(aa$protein, aa$organism, sep="_")
-    # make some noise for the user
-    message("aa2eos: found ", appendLF=FALSE)
-    message(name, " (", f, ", ", appendLF=FALSE)
-    message(round(length, 3), " residues)")
-    ref <- aa$ref
-    header <- data.frame(name=name, abbrv=NA, formula=f, state=state, ref1=ref, ref2=NA, date=NA, stringsAsFactors=FALSE)
-    eosout <- cbind(header, eos)
-    return(eosout)
-  }
-  # loop over each row of aa
-  out <- lapply(1:nrow(aa), function(i) eosfun(aa[i, ]))
-  out <- do.call(rbind, out)
-  rownames(out) <- NULL
-  return(out)
-}
-
-seq2aa <- function(protein, sequence) {
-  # remove newlines and whitespace
-  sequence <- gsub("\\s", "", gsub("[\r\n]", "", sequence))
-  # make a data frame from counting the amino acids in the sequence
-  caa <- count.aa(sequence)
-  colnames(caa) <- aminoacids(3)
-  # a protein with no amino acids is sort of boring
-  if(all(caa==0)) stop("no characters match an amino acid")
-  ip <- suppressMessages(iprotein(protein))
-  # now make the data frame
-  po <- strsplit(protein, "_")[[1]]
-  aa <- data.frame(protein=po[1], organism=po[2], ref=NA, abbrv=NA, stringsAsFactors=FALSE)
-  aa <- cbind(aa, chains=1, caa)
-  return(aa)
-}
-
-aasum <- function(aa, abundance=1, average=FALSE, protein=NULL, organism=NULL) {
-  # returns the sum of the amino acid counts in aa,
-  # multiplied by the abundances of the proteins
-  abundance <- rep(abundance, length.out=nrow(aa))
-  # drop any NA rows or abundances
-  ina.aa <- is.na(aa$chains)
-  ina.ab <- is.na(abundance)
-  ina <- ina.aa | ina.ab
-  if(any(ina)) {
-    aa <- aa[!ina, ]
-    abundance <- abundance[!ina]
-    message("aasum: dropped ", sum(ina), " proteins with NA composition and/or abundance")
-  }
-  # we don't know how to deal with different numbers of polypeptide chains
-  if(!all(aa$chains==aa$chains[1])) stop("different numbers of polypeptide chains")
-  # multiply
-  aa[, 6:25] <- aa[, 6:25] * abundance
-  # sum
-  out <- aa[1, ]
-  out[, 5:25] <- colSums(aa[, 5:25])
-  # average if told to do so
-  if(average) {
-    # polypeptide chains by number of proteins, residues by frequence
-    out[, 5] <- out[, 5]/nrow(aa)
-    out[, 6:25] <- out[, 6:25]/sum(abundance)
-  }
-  # add protein and organism names if given
-  if(!is.null(protein)) out$protein <- protein
-  if(!is.null(organism)) out$organism <- organism
-  return(out)
-}
-
-read.aa <- function(file="protein.csv", ...) {
-  # 20090428 added colClasses here
-  # 20140128 added as.is=TRUE (in case numeric values are stored in ref or abbrv column)
-  aa <- read.csv(file, colClasses=c(rep("character", 2), NA, NA, rep("numeric", 21)), as.is=TRUE, ...)
-  if(!identical(colnames(aa), colnames(get("thermo")$protein)))
-    stop(paste("format of", file, "is incompatible with thermo$protein"))
-  return(aa)
-}
-
-add.protein <- function(aa) {
-  # add a properly constructed data frame of 
-  # amino acid counts to thermo$protein
-  thermo <- get("thermo")
-  if(!identical(colnames(aa), colnames(thermo$protein)))
-    stop("the value of 'aa' is not a data frame with the same columns as thermo$protein")
-  # find any protein IDs that are duplicated
-  po <- paste(aa$protein, aa$organism, sep="_")
-  ip <- suppressMessages(iprotein(po))
-  ipdup <- !is.na(ip)
-  # now we're ready to go
-  tp.new <- thermo$protein
-  if(!all(ipdup)) tp.new <- rbind(tp.new, aa[!ipdup, ])
-  if(any(ipdup)) {
-    if(any(sapply(1:4, function(i){is.factor(aa[, i])})))
-      stop(paste("converting factors causes problems replacing protein data",
-        "  data file should be read using e.g. aa <- read.csv(file, stringsAsFactors=FALSE)", sep="\n"))
-    tp.new[ip[ipdup], ] <- aa[ipdup, ]
-  }
-  rownames(tp.new) <- NULL
-  thermo$protein <- tp.new
-  assign("thermo", thermo, "CHNOSZ")
-  # return the new rownumbers
-  ip <- iprotein(po)
-  # make some noise
-  if(!all(ipdup)) message("add.protein: added ", nrow(aa)-sum(ipdup), " new protein(s) to thermo$protein")
-  if(any(ipdup)) message("add.protein: replaced ", sum(ipdup), " existing protein(s) in thermo$protein")
-  return(ip)
-}

Modified: pkg/CHNOSZ/R/protein.info.R
===================================================================
--- pkg/CHNOSZ/R/protein.info.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/protein.info.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -2,12 +2,38 @@
 # calculate formulas and summarize properties of proteins
 # MP90.cp: additive heat capacity from groups of Makhatadze and Privalov, 1990
 # group.formulas: chemical makeup of the amino acid residues
+# protein.info: find rownumber in thermo$protein
 # protein.formula: chemical makeup of the indicated proteins
 # protein.length: lengths of the indicated proteins
-# protein.info: summarize properties of proteins
 # protein.basis: coefficients of basis species in formation reactions of [ionized] proteins [residues]
 # protein.equil: step-by-step example of protein equilibrium calculation
 
+protein.info <- function(protein, organism=NULL) {
+  # find the rownumber(s) of thermo$protein that matches
+  # 'protein' numeric (the rownumber itself)
+  # 'protein' character, e.g. LYSC_CHICK
+  # 'protein' and 'organism', e.g. 'LYSC', 'CHICK'
+  thermo <- get("thermo")
+  if(is.numeric(protein)) {
+    iproteins <- 1:nrow(thermo$protein)
+    protein[!protein %in% iproteins] <- NA
+    iprotein <- protein
+  } else {
+    # from here we'll search by protein/organism pairs
+    tp.po <- paste(thermo$protein$protein, thermo$protein$organism, sep="_")
+    if(is.null(organism)) my.po <- protein
+    else my.po <- paste(protein, organism, sep="_")
+    iprotein <- match(my.po, tp.po)
+  }
+  # tell the user about NA's
+  if(any(is.na(iprotein))) {
+    nNA <- sum(is.na(iprotein))
+    if(nNA==1) ptext <- "" else ptext <- "s"
+    message("iprotein: ", sum(is.na(iprotein)), " protein", ptext, " not matched")
+  }
+  return(iprotein)
+}
+
 MP90.cp <- function(protein, T) {
   # T (temperature, degrees C), protein (name of protein)
   # returns heat capacity of protein (kj/mol)
@@ -49,7 +75,6 @@
   return(cnew)
 }
 
-
 group.formulas <- function() {
   # return a matrix with chemical formulas of residues
   # names of the sidechain groups
@@ -84,56 +109,6 @@
   return(pl)
 }
 
-protein.info <- function(protein, T=25, residue=FALSE, round.it=FALSE) {
-  # make a table of selected properties for proteins
-  # listed in protein
-  aa <- ip2aa(protein)
-  pname <- paste(aa$protein, aa$organism, sep="_")
-  length <- protein.length(aa)
-  pf <- protein.formula(aa)
-  G <- unlist(subcrt(pname, T=T, property="G")$out)
-  Z <- rep(NA, length(pname))
-  G.Z <- rep(NA, length(pname))
-  ZC <- ZC(pf)
-  # run ionization calculations if we have H+
-  thermo <- get("thermo")
-  if(!is.null(thermo$basis)) {
-    iHplus <- match("H+", rownames(thermo$basis))
-    if(!is.na(iHplus)) {
-      pH <- -thermo$basis$logact[iHplus]
-      Z <- ionize.aa(aa, T=T, pH=pH)[1, ]
-      G.ionization <- ionize.aa(aa, T=T, pH=pH, property="G")[1, ]
-      G.Z <- G + G.ionization
-      # add charge to the chemical formulas
-      pf <- cbind(pf, Z)
-      iH <- match("H", colnames(pf))
-      pf[, iH] <- pf[, iH] + Z
-    }
-  }
-  # take care of residue conversion
-  if(residue) {
-    pf <- pf / length
-    G <- G / length
-    Z <- Z / length
-    G.Z <- G.Z / length
-    length <- length / length
-    # round the coefficients in the formulas
-    if(round.it) pf <- round(pf, 3)
-  }
-  # convert each protein formula to a single line
-  formula <- as.chemical.formula(round(pf, 3))
-  if(round.it) {
-    length <- round(length, 1)
-    G <- round(G, 3)
-    Z <- round(Z, 3)
-    G.Z <- round(G.Z, 3)
-    ZC <- round(ZC, 3)
-  }
-  out <- data.frame(protein=pname, length=length, formula=formula, G=G, Z=Z, G.Z=G.Z, ZC=ZC, stringsAsFactors=FALSE)
-  rownames(out) <- NULL
-  return(out)
-}
-
 protein.basis <- function(protein, T=25, normalize=FALSE) {
   # 20090902 calculate the coefficients of basis species in reactions
   # to form proteins (possibly per normalized by length) listed in protein

Modified: pkg/CHNOSZ/R/util.affinity.R
===================================================================
--- pkg/CHNOSZ/R/util.affinity.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/util.affinity.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -162,7 +162,7 @@
       isprotein <- grepl("_", myspecies$name)
       if(any(isprotein)) {
         # the rownumbers in thermo$protein
-        ip <- iprotein(myspecies$name[isprotein])
+        ip <- protein.info(myspecies$name[isprotein])
         # get the affinity of ionization
         iHplus <- match("H+", rownames(mybasis))
         # as.numeric is needed in case the logact column is character mode

Modified: pkg/CHNOSZ/R/util.expression.R
===================================================================
--- pkg/CHNOSZ/R/util.expression.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/R/util.expression.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -7,6 +7,8 @@
   # that include subscripts, superscripts (if charged)
   # and optionally designations of states +/- loga or logf prefix
   if(length(species) > 1) (stop("more than one species"))
+  # convert to character so that "1", "2", etc. don't get converted to chemical formulas via makeup()
+  species <- as.character(species)
   # the counts of elements in the species:
   # here we don't care too much if an "element" is a real element
   # (listed in thermo$element), so we suppress warnings

Modified: pkg/CHNOSZ/demo/bugstab.R
===================================================================
--- pkg/CHNOSZ/demo/bugstab.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/demo/bugstab.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -2,13 +2,14 @@
 # based on "bugstab" function in Supporting Information of Dick, 2016
 # (http://dx.doi.org/10.7717/peerj.2238)
 
+# set up graphics device
+layout(cbind(matrix(sapply(list(c(1, 2), c(3, 4)), function(x) rep(rep(x, each=3), 3)), nrow=6, byrow=TRUE),
+             matrix(rep(c(0, 5, 5, 5, 5, 0), each=4), nrow=6, byrow=TRUE)))
+opar <- par(mar=c(3.3, 3.3, 1.5, 1.5), mgp=c(2.1, 0.7, 0), xaxs="i", yaxs="i", las=1, cex=0.9)
 # resolution for plots
 res <- 500
 # basis can be "QEC" or "CHNOS"
 basis <- "QEC"
-layout(cbind(matrix(sapply(list(c(1, 2), c(3, 4)), function(x) rep(rep(x, each=3), 3)), nrow=6, byrow=TRUE),
-             matrix(rep(c(0, 5, 5, 5, 5, 0), each=4), nrow=6, byrow=TRUE)))
-par(mar=c(3.3, 3.3, 1.5, 1.5), mgp=c(2.1, 0.7, 0), xaxs="i", yaxs="i", las=1, cex=0.9)
 # read bioproject ids, species names
 mfile <- system.file("extdata/abundance/microbes.csv", package="CHNOSZ")
 bugs <- read.csv(mfile, as.is=TRUE)
@@ -65,3 +66,7 @@
 title(main="Cumulative stability count")
 box()
 label.figure("E", yfrac=0.96, paren=FALSE, font=2, cex=1)
+
+# reset graphics device to default
+par(opar)
+layout(1)

Modified: pkg/CHNOSZ/demo/protein.equil.R
===================================================================
--- pkg/CHNOSZ/demo/protein.equil.R	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/demo/protein.equil.R	2017-02-20 11:27:11 UTC (rev 166)
@@ -1,6 +1,6 @@
 ## steps in calculation of chemical activities of two proteins
 ## in metastable equilibrium, after Dick and Shock, 2011
-protein <- iprotein(c("CSG_METVO", "CSG_METJA"))
+protein <- protein.info(c("CSG_METVO", "CSG_METJA"))
 # clear out amino acid residues loaded by the example above
 # ( in affinity(iprotein=ip) )
 data(thermo)

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/inst/NEWS	2017-02-20 11:27:11 UTC (rev 166)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.8-55 (2017-02-19)
+CHANGES IN CHNOSZ 1.0.8-56 (2017-02-20)
 ---------------------------------------
 
 DOCUMENTATION:
@@ -118,6 +118,9 @@
 - Rename browse.refs() to thermo.refs(); remove URL browsing (except for
   summary table).
 
+- Rename iprotein() to protein.info(), replacing the previous function
+  of the same name.
+
 CHANGES IN CHNOSZ 1.0.8 (2016-05-28)
 ------------------------------------
 

Copied: pkg/CHNOSZ/man/add.protein.Rd (from rev 165, pkg/CHNOSZ/man/iprotein.Rd)
===================================================================
--- pkg/CHNOSZ/man/add.protein.Rd	                        (rev 0)
+++ pkg/CHNOSZ/man/add.protein.Rd	2017-02-20 11:27:11 UTC (rev 166)
@@ -0,0 +1,86 @@
+\name{add.protein}
+\alias{ip2aa}
+\alias{aa2eos}
+\alias{seq2aa}
+\alias{aasum}
+\alias{read.aa}
+\alias{add.protein}
+\title{Amino Acid Compositions of Proteins}
+\description{
+  Functions to identify proteins, get and set amino acid compositions, and calculate thermodynamic properties from group additivity.
+}
+
+\usage{
+  ip2aa(protein, organism=NULL, residue=FALSE)
+  aa2eos(aa, state=get("thermo")$opt$state)
+  seq2aa(protein, sequence)
+  aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
+  read.aa(file = "protein.csv", ...)
+  add.protein(aa)
+}
+
+\arguments{
+  \item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}$protein})}
+  \item{organism}{character, name of organism}
+  \item{residue}{logical, compute per-residue counts?}
+  \item{aa}{data frame, amino acid composition in the format of \code{thermo$protein}}
+  \item{state}{character, physical state}
+  \item{sequence}{character, protein sequence}
+  \item{abundance}{numeric, abundances of proteins}
+  \item{average}{logical, return the weighted average of amino acid counts?}
+  \item{file}{character, path to file with amino acid compositions}
+  \item{...}{additional arguments passed to \code{\link{read.csv}}}
+}
+
+\details{
+  A \samp{protein} in CHNOSZ is defined by a name and by the counts of amino acids, stored in \code{\link{thermo}$protein}. The purpose of the functions described here is to identify proteins and work with their amino acid compositions. From the amino acid compositions, the thermodynamic properties of the proteins can be estimated (Dick et al., 2006) for use in other functions in the package. 
+
+  Often, the names of proteins are sufficient to set up calculations using functions such as \code{\link{subcrt}} or \code{\link{species}}. The names of proteins in CHNOSZ are distinguished from those of other chemical species by having an underscore character ("_") that separates two identifiers, referred to as the \code{protein} and \code{organism} (but any other meaning can be attached to these names). An example is \samp{LYSC_CHICK}. 
+
+  The first few functions provide low-level operations:
+
+  \code{ip2aa} returns the row(s) of \code{thermo$protein} that match the supplied protein names, OR the protein indices found by \code{iprotin}. Set \code{residue} to TRUE to return the per-residue composition (i.e. amino acid composition of the protein divided by total number of residues). For this function only, if the \code{protein} argument is a data frame, it is returned unchanged, except for possibly the per-residue calculation.
+
+  \code{aa2eos} calculates the thermodynamic properties and equations-of-state parameters for the completely nonionized proteins using group additivity with parameters taken from Dick et al., 2006 (aqueous proteins) and LaRowe and Dick, 2012 (crystalline proteins and revised aqueous methionine sidechain group). The return value is a data frame in the same format as \code{thermo$obigt}. \code{state} indicates the physical state for the parameters used in the calculation (\samp{aq} or \samp{cr}).
+
+  The remaining functions are more likely to be called directly by the user:
+
+  \code{seq2aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, corresponding to the provided \code{sequence}. Here, the \code{protein} argument indicates the name of the protein with an underscore (e.g. \samp{LYSC_CHICK}).
+
+  \code{aasum} returns a data frame representing the sum of amino acid compositions in the rows of the input \code{aa} data frame. The amino acid compositions are multiplied by the indicated \code{abundance}; that argument is recycled to match the number of rows of \code{aa}. If \code{average} is TRUE the final sum is divided by the number of input compositions. The name used in the output is taken from the first row of \code{aa} or from \code{protein} and \code{organism} if they are specified. This function is useful for calculating bulk amino acid compositions in stress response experiments or localization studies; see \code{\link{read.expr}} for examples of its use.
+
+\code{read.aa} returns a data frame of amino acid composition based on the contents of the indicated \code{file}, which should be a CSV file with the same column names as \code{thermo$protein}.
+
+\code{add.protein} completes the loop; any amino acid composition returned by the \code{*aa} functions described above can be added to \code{thermo$protein} using this function to be made available to other functions in the package.
+The amino acid compositions of proteins in \code{aa} with the same name as one in \code{thermo$protein} are replaced.
+The value returned by this function is the rownumbers of \code{thermo$protein} that are added and/or replaced.
+
+}
+
+\examples{
+\dontshow{data(thermo)}
+# manually adding a new protein
+# Human Gastric juice peptide 1
+aa <- seq2aa("GAJU_HUMAN", "LAAGKVEDSD")
+ip <- add.protein(aa)
+stopifnot(protein.length(ip)==10)
+# the chemical formula of this peptide
+stopifnot(as.chemical.formula(protein.formula(ip))=="C41H69N11O18")
+# we can also calculate a formula without using add.protein
+as.chemical.formula(protein.formula(seq2aa("pentapeptide_test", "ANLSG")))
+}
+
+\seealso{
+\code{\link{read.fasta}} and \code{\link{uniprot.aa}} for getting amino acid compositions from a FASTA file or downloading them from UniProt, and \code{\link{more.aa}} for getting amino acid compositions for model organisms from additional data files in the \code{\link{extdata}/protein} directory.
+
+\code{\link{protein.info}} for protein-level functions (chemical formulas, summaries of reaction coefficients and energies), and \code{\link{read.expr}} for working with protein abundance and subcellular localization data.
+
+Examples of stability calculations for proteins are in \code{\link{protein}}.
+}
+
+\references{
+  Dick, J. M., LaRowe, D. E. and Helgeson, H. C. (2006) Temperature, pressure, and electrochemical constraints on protein speciation: Group additivity calculation of the standard molal thermodynamic properties of ionized unfolded proteins. \emph{Biogeosciences} \bold{3}, 311--336. \url{http://dx.doi.org/10.5194/bg-3-311-2006}
+
+}
+
+\concept{Protein thermodynamic modeling}

Modified: pkg/CHNOSZ/man/data.Rd
===================================================================
--- pkg/CHNOSZ/man/data.Rd	2017-02-19 15:26:25 UTC (rev 165)
+++ pkg/CHNOSZ/man/data.Rd	2017-02-20 11:27:11 UTC (rev 166)
@@ -166,7 +166,7 @@
     }
 
     \item \code{thermo$protein}
-    Data frame of amino acid compositions of selected proteins. Many of the compositions were taken from the SWISS-PROT/UniProt online database (Boeckmann et al., 2003) and the protein and organism names usually follow the conventions adopted there. In some cases different isoforms of proteins are identified using modifications of the protein names; for example, \samp{MOD5.M} and \code{MOD5.N} proteins of \samp{YEAST} denote the mitochondrial and nuclear isoforms of this protein. See \code{\link{iprotein}} to search this data frame by protein name, and other functions to work with the amino acid compositions.
+    Data frame of amino acid compositions of selected proteins. Many of the compositions were taken from the SWISS-PROT/UniProt online database (Boeckmann et al., 2003) and the protein and organism names usually follow the conventions adopted there. In some cases different isoforms of proteins are identified using modifications of the protein names; for example, \samp{MOD5.M} and \code{MOD5.N} proteins of \samp{YEAST} denote the mitochondrial and nuclear isoforms of this protein. See \code{\link{protein.info}} to search this data frame by protein name, and other functions to work with the amino acid compositions.
     \tabular{lll}{
       \code{protein} \tab character \tab Identification of protein\cr
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/chnosz -r 166


More information about the CHNOSZ-commits mailing list