[CHNOSZ-commits] r174 - in pkg/CHNOSZ: . R data inst inst/extdata/thermo man tests/testthat vignettes

Fri Feb 24 07:46:29 CET 2017

Author: jedick
Date: 2017-02-24 07:46:29 +0100 (Fri, 24 Feb 2017)
New Revision: 174

Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/add.protein.R
   pkg/CHNOSZ/R/protein.info.R
   pkg/CHNOSZ/data/protein.csv
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/inst/extdata/thermo/obigt_check.csv
   pkg/CHNOSZ/man/add.protein.Rd
   pkg/CHNOSZ/man/protein.Rd
   pkg/CHNOSZ/man/protein.info.Rd
   pkg/CHNOSZ/man/util.fasta.Rd
   pkg/CHNOSZ/tests/testthat/test-add.protein.R
   pkg/CHNOSZ/tests/testthat/test-affinity.R
   pkg/CHNOSZ/vignettes/anintro.Rmd
   pkg/CHNOSZ/vignettes/hotspring.Rnw
   pkg/CHNOSZ/vignettes/hotspring.lyx
Log:
remove read.aa()


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/DESCRIPTION	2017-02-24 06:46:29 UTC (rev 174)
@@ -1,6 +1,6 @@
 Date: 2017-02-24
 Package: CHNOSZ
-Version: 1.0.8-63
+Version: 1.0.8-64
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey Dick
 Maintainer: Jeffrey Dick <j3ffdick at gmail.com>

Modified: pkg/CHNOSZ/R/add.protein.R
===================================================================
--- pkg/CHNOSZ/R/add.protein.R	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/R/add.protein.R	2017-02-24 06:46:29 UTC (rev 174)
@@ -5,7 +5,6 @@
 # add.protein - add amino acid counts to thermo$protein (returns iprotein)
 # seq2aa - calculate amino acid counts from a sequence
 # aasum - combine amino acid counts (sum, average, or weighted sum by abundance)
-# read.aa - read amino acid counts from a file
 
 seq2aa <- function(protein, sequence) {
   # remove newlines and whitespace
@@ -55,21 +54,12 @@
   return(out)
 }
 
-read.aa <- function(file="protein.csv", ...) {
-  # 20090428 added colClasses here
-  # 20140128 added as.is=TRUE (in case numeric values are stored in ref or abbrv column)
-  aa <- read.csv(file, colClasses=c(rep("character", 2), NA, NA, rep("numeric", 21)), as.is=TRUE, ...)
-  if(!identical(colnames(aa), colnames(get("thermo")$protein)))
-    stop(paste("format of", file, "is incompatible with thermo$protein"))
-  return(aa)
-}
-
 add.protein <- function(aa) {
   # add a properly constructed data frame of 
   # amino acid counts to thermo$protein
   thermo <- get("thermo")
   if(!identical(colnames(aa), colnames(thermo$protein)))
-    stop("the value of 'aa' is not a data frame with the same columns as thermo$protein")
+    stop("'aa' does not have the same columns as thermo$protein")
   # find any protein IDs that are duplicated
   po <- paste(aa$protein, aa$organism, sep="_")
   ip <- pinfo(po)

Modified: pkg/CHNOSZ/R/protein.info.R
===================================================================
--- pkg/CHNOSZ/R/protein.info.R	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/R/protein.info.R	2017-02-24 06:46:29 UTC (rev 174)
@@ -8,7 +8,7 @@
 # protein.basis: coefficients of basis species in formation reactions of [ionized] proteins [residues]
 # protein.equil: step-by-step example of protein equilibrium calculation
 
-pinfo <- function(protein, organism=NULL, residue=FALSE) {
+pinfo <- function(protein, organism=NULL, residue=FALSE, regexp=FALSE) {
   # return the `protein` (possibly per residue) for:
   #   dataframe `protein`
   # return the rownumber(s) of thermo$protein for:
@@ -29,11 +29,19 @@
     # compute per-residue counts if requested
     if(residue) out[, 5:25] <- out[, 5:25]/rowSums(out[, 6:25])
   } else {
-    # search for protein or protein_organism in thermo$protein
-    t_p_names <- paste(t_p$protein, t_p$organism, sep="_")
-    if(is.null(organism)) my_names <- protein
-    else my_names <- paste(protein, organism, sep="_")
-    iprotein <- match(my_names, t_p_names)
+    # search for protein by regular expression
+    if(regexp) {
+      iprotein <- grepl(protein, t_p$protein)
+      iorganism <- iprotein
+      if(!is.null(organism)) iorganism <- grepl(organism, t_p$organism)
+      iprotein <- which(iprotein & iorganism)
+    } else {
+      # search for protein or protein_organism in thermo$protein
+      t_p_names <- paste(t_p$protein, t_p$organism, sep="_")
+      if(is.null(organism)) my_names <- protein
+      else my_names <- paste(protein, organism, sep="_")
+      iprotein <- match(my_names, t_p_names)
+    }
     out <- iprotein
   }
   out

Modified: pkg/CHNOSZ/data/protein.csv
===================================================================
--- pkg/CHNOSZ/data/protein.csv	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/data/protein.csv	2017-02-24 06:46:29 UTC (rev 174)
@@ -468,3 +468,41 @@
 PTC1,HUMAN,UniProt,Q13635,1,116,27,63,78,64,101,42,62,50,153,31,49,102,57,79,105,83,108,21,56
 SMO,HUMAN,UniProt,Q99835,1,70,26,28,36,37,57,15,33,29,71,14,25,61,26,47,52,43,53,19,18
 GLI3R,HUMAN,UniProt,P10071,1,10,0,6,4,5,5,12,7,0,7,5,3,26,0,7,17,6,3,0,8
+HXA1,HUMAN,UniProt,P49639,1,25,7,9,17,9,26,24,7,14,19,5,16,25,20,14,46,20,14,2,16
+HXA2,HUMAN,UniProt,O43364,1,31,8,16,29,16,22,10,9,20,38,4,18,30,23,15,42,24,13,2,6
+HXA3,HUMAN,UniProt,O43365,1,49,5,8,15,8,44,18,6,19,28,8,17,75,27,15,49,20,9,2,21
+HXA4,HUMAN,UniProt,Q00056,1,37,3,4,14,6,28,15,7,16,21,5,8,52,19,20,24,14,10,3,14
+HXA5,HUMAN,UniProt,P20719,1,30,3,10,14,8,25,12,8,10,11,7,10,20,11,21,41,8,5,3,13
+HXA6,HUMAN,UniProt,P31267,1,16,5,10,12,10,20,4,5,13,16,4,9,15,15,17,28,10,4,3,17
+HXA7,HUMAN,UniProt,P31268,1,33,5,13,20,9,18,4,6,11,14,3,10,12,8,15,18,11,4,3,13
+HXA9,HUMAN,UniProt,P31269,1,33,4,14,15,9,17,12,4,14,21,5,13,26,8,19,17,15,11,4,11
+HXA10,HUMAN,UniProt,P31260,1,43,9,14,22,12,59,5,6,19,32,6,13,51,16,22,43,14,7,3,14
+HXA11,HUMAN,UniProt,P31270,1,32,6,12,21,12,20,6,5,15,18,5,12,25,12,24,39,18,13,2,16
+HXA13,HUMAN,UniProt,P31271,1,93,6,8,17,10,35,11,6,19,19,10,13,37,13,16,29,12,13,5,16
+HXB1,HUMAN,UniProt,P14653,1,29,5,9,19,11,29,4,2,12,16,4,11,39,15,15,37,17,8,2,17
+HXB2,HUMAN,UniProt,P14652,1,39,9,14,29,19,27,4,7,12,35,3,6,53,16,20,33,14,11,3,2
+HXB3,HUMAN,UniProt,P14651,1,42,7,8,15,10,64,15,4,22,28,10,20,60,21,13,46,19,6,2,19
+HXB4,HUMAN,UniProt,P17483,1,22,7,5,13,6,19,7,5,11,13,4,9,46,10,23,21,6,10,3,11
+HXB5,HUMAN,UniProt,P09067,1,30,3,9,15,11,16,7,7,10,16,8,11,19,12,19,46,13,3,3,11
+HXB6,HUMAN,UniProt,P17509,1,18,6,6,20,10,17,4,4,12,14,4,5,19,13,17,23,10,4,3,15
+HXB7,HUMAN,UniProt,P09629,1,26,4,5,18,10,22,3,5,10,11,6,8,10,11,17,20,13,3,3,12
+HXB9,HUMAN,UniProt,P17482,1,21,3,6,22,7,17,7,6,16,20,5,8,24,14,17,26,8,8,4,11
+HXB13,HUMAN,UniProt,Q92826,1,33,7,9,14,8,26,5,6,16,19,4,8,33,13,16,22,14,12,4,15
+HXC4,HUMAN,UniProt,P09017,1,20,5,6,18,4,12,14,10,12,13,5,9,35,16,20,31,12,5,3,14
+HXC5,HUMAN,UniProt,Q00444,1,25,4,6,14,5,12,6,8,14,13,8,12,20,12,17,20,9,3,3,11
+HXC6,HUMAN,UniProt,P09630,1,14,3,8,17,9,18,5,9,11,14,5,15,8,17,20,24,15,7,3,13
+HXC8,HUMAN,UniProt,P31273,1,13,4,7,26,11,19,10,3,16,15,4,13,14,15,15,26,6,9,3,13
+HXC9,HUMAN,UniProt,P31274,1,24,4,15,14,9,15,7,4,16,18,7,8,27,7,20,24,12,11,4,14
+HXC10,HUMAN,UniProt,Q9NYD6,1,25,9,11,32,9,20,5,6,24,26,8,18,30,10,23,42,18,9,3,14
+HXC11,HUMAN,UniProt,O43248,1,28,8,7,25,15,21,8,5,17,19,6,17,28,9,21,35,8,11,2,14
+HXC12,HUMAN,UniProt,P31275,1,18,5,11,17,11,42,3,4,12,32,2,14,30,9,21,26,4,8,3,10
+HXC13,HUMAN,UniProt,P31276,1,28,7,11,16,7,37,11,7,19,23,4,5,36,13,17,37,12,18,5,17
+HXD1,HUMAN,UniProt,Q9GZZ0,1,48,6,10,13,17,33,5,6,17,25,3,8,36,12,14,34,14,14,2,11
+HXD3,HUMAN,UniProt,P31249,1,44,9,9,19,11,42,15,7,22,29,9,18,59,26,15,48,18,11,2,19
+HXD4,HUMAN,UniProt,P09016,1,18,4,8,12,7,27,9,4,15,15,6,7,33,15,15,24,10,10,3,13
+HXD8,HUMAN,UniProt,P13378,1,36,5,8,21,12,24,11,6,15,14,4,11,37,18,18,17,10,7,3,13
+HXD9,HUMAN,UniProt,P28356,1,43,7,7,20,11,46,6,7,17,17,8,9,36,13,21,43,17,10,4,10
+HXD10,HUMAN,UniProt,P28358,1,16,9,8,30,9,12,3,9,23,24,14,20,27,19,20,41,22,20,3,11
+HXD11,HUMAN,UniProt,P31277,1,48,6,11,19,16,50,2,4,16,16,6,10,41,15,21,23,7,10,2,15
+HXD12,HUMAN,UniProt,P35452,1,39,4,7,13,10,24,0,4,16,28,3,11,28,15,19,17,10,9,3,10
+HXD13,HUMAN,UniProt,P35453,1,54,5,11,13,11,32,7,7,18,14,7,11,23,13,19,45,12,20,5,16

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/inst/NEWS	2017-02-24 06:46:29 UTC (rev 174)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.8-63 (2017-02-24)
+CHANGES IN CHNOSZ 1.0.8-64 (2017-02-24)
 ---------------------------------------
 
 DOCUMENTATION:
@@ -64,9 +64,6 @@
   marks and lines; this is used in diagram() to redraw the axes on
   filled diagrams.
 
-- Add `...` argument to read.aa() (additional arguments for
-  read.csv()).
-
 - seq2aa() removes newlines and whitespace before counting the
   letters in the sequence.
 
@@ -136,7 +133,8 @@
   summary table).
 
 - New function pinfo() merges functionality of old iprotein() and
-  ip2aa(), which have been removed (along with protein.info()).
+  ip2aa(). Add `regexp` argument to control whether matches are made
+  using a regular expression.
 
 - Rename aa2eos() to protein.obigt().
 
@@ -145,6 +143,8 @@
 - Remove stress() and stress.csv; move data from Tai et al., 2005 (used
   in an example in ?read.expr) to TBD+05.csv.
 
+- Remove read.aa() - replaced by read.csv() with as.is=TRUE.
+
 CHANGES IN CHNOSZ 1.0.8 (2016-05-28)
 ------------------------------------
 

Modified: pkg/CHNOSZ/inst/extdata/thermo/obigt_check.csv
===================================================================
--- pkg/CHNOSZ/inst/extdata/thermo/obigt_check.csv	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/inst/extdata/thermo/obigt_check.csv	2017-02-24 06:46:29 UTC (rev 174)
@@ -210,34 +210,34 @@
 "OBIGT",1793,"Gly-Tyr-Gly","aq",,-190.24,
 "OBIGT",1794,"Gly-Val-Gly","aq",,-155.72,
 "OBIGT",1795,"[GXGBB]","aq",,-98.93,
-"OBIGT",1833,"methyldiethanolamine","aq",1.61,,
-"OBIGT",1865,"MgAsO4-","aq",1.3,,
-"OBIGT",1868,"MnAsO4-","aq",-1.45,,
-"OBIGT",1964,"antigorite","cr1",,,812
-"OBIGT",1998,"clinochlore,7a","cr1",,,666
-"OBIGT",2017,"daphnite,14a","cr",,,-836
-"OBIGT",2042,"ferrosilite","cr1",,,694
-"OBIGT",2043,"ferrosilite","cr2",,,694
-"OBIGT",2058,"greenalite","cr",,,142507
-"OBIGT",2073,"hydromagnesite","cr",,,-2569
-"OBIGT",2204,"n-octadecane","cr",-2.63,,
-"OBIGT",2205,"n-nonadecane","cr",-13.32,,
-"OBIGT",2206,"n-eicosane","cr",-2.79,,
-"OBIGT",2207,"n-heneicosane","cr",-8.61,,
-"OBIGT",2208,"n-docosane","cr",-2.63,,
-"OBIGT",2209,"n-tricosane","cr",-5.22,,
-"OBIGT",2210,"n-tetracosane","cr",-2.02,,
-"OBIGT",2211,"n-pentacosane","cr",-2.93,,
-"OBIGT",2212,"n-hexacosane","cr",-1.29,,
-"OBIGT",2213,"n-heptacosane","cr",-1.23,,
-"OBIGT",2265,"carbazole","cr",-43.39,,
-"OBIGT",2306,"triphenylene","cr",,,541
-"OBIGT",2619,"deoxyadenosine","cr",,,-2977
-"OBIGT",2626,"acetamide","cr",-67.91,,
-"OBIGT",2671,"jarosite","cr",,,20697
-"OBIGT",2672,"natrojarosite","cr",,,17554
-"OBIGT",2744,"n-nonacontane","liq",,,635
-"OBIGT",2751,"2-methyloctane","liq",10,,
-"OBIGT",3164,"5,6-dithiadecane","liq",2,,
-"OBIGT",3239,"ethylene","gas",-4.59,,
-"OBIGT",3249,"3,5-dimethylphenol","gas",,,628
+"OBIGT",1839,"methyldiethanolamine","aq",1.61,,
+"OBIGT",1871,"MgAsO4-","aq",1.3,,
+"OBIGT",1874,"MnAsO4-","aq",-1.45,,
+"OBIGT",1970,"antigorite","cr1",,,812
+"OBIGT",2004,"clinochlore,7a","cr1",,,666
+"OBIGT",2023,"daphnite,14a","cr",,,-836
+"OBIGT",2048,"ferrosilite","cr1",,,694
+"OBIGT",2049,"ferrosilite","cr2",,,694
+"OBIGT",2064,"greenalite","cr",,,142507
+"OBIGT",2079,"hydromagnesite","cr",,,-2569
+"OBIGT",2210,"n-octadecane","cr",-2.63,,
+"OBIGT",2211,"n-nonadecane","cr",-13.32,,
+"OBIGT",2212,"n-eicosane","cr",-2.79,,
+"OBIGT",2213,"n-heneicosane","cr",-8.61,,
+"OBIGT",2214,"n-docosane","cr",-2.63,,
+"OBIGT",2215,"n-tricosane","cr",-5.22,,
+"OBIGT",2216,"n-tetracosane","cr",-2.02,,
+"OBIGT",2217,"n-pentacosane","cr",-2.93,,
+"OBIGT",2218,"n-hexacosane","cr",-1.29,,
+"OBIGT",2219,"n-heptacosane","cr",-1.23,,
+"OBIGT",2271,"carbazole","cr",-43.39,,
+"OBIGT",2312,"triphenylene","cr",,,541
+"OBIGT",2625,"deoxyadenosine","cr",,,-2977
+"OBIGT",2632,"acetamide","cr",-67.91,,
+"OBIGT",2677,"jarosite","cr",,,20697
+"OBIGT",2678,"natrojarosite","cr",,,17554
+"OBIGT",2750,"n-nonacontane","liq",,,635
+"OBIGT",2757,"2-methyloctane","liq",10,,
+"OBIGT",3170,"5,6-dithiadecane","liq",2,,
+"OBIGT",3245,"ethylene","gas",-4.59,,
+"OBIGT",3255,"3,5-dimethylphenol","gas",,,628

Modified: pkg/CHNOSZ/man/add.protein.Rd
===================================================================
--- pkg/CHNOSZ/man/add.protein.Rd	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/man/add.protein.Rd	2017-02-24 06:46:29 UTC (rev 174)
@@ -1,7 +1,6 @@
 \name{add.protein}
 \alias{add.protein}
 \alias{seq2aa}
-\alias{read.aa}
 \alias{aasum}
 \title{Amino Acid Compositions of Proteins}
 \description{
@@ -11,7 +10,6 @@
 \usage{
   add.protein(aa)
   seq2aa(protein, sequence)
-  read.aa(file = "protein.csv", ...)
   aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
 }
 
@@ -19,7 +17,6 @@
   \item{aa}{data frame, amino acid composition in the format of \code{thermo$protein}}
   \item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}$protein})}
   \item{sequence}{character, protein sequence}
-  \item{file}{character, path to file with amino acid compositions}
   \item{...}{additional arguments passed to \code{\link{read.csv}}}
   \item{abundance}{numeric, abundances of proteins}
   \item{average}{logical, return the weighted average of amino acid counts?}
@@ -41,8 +38,6 @@
 If \code{average} is TRUE the final sum is divided by the number of input compositions.
 The name used in the output is taken from the first row of \code{aa} or from \code{protein} and \code{organism} if they are specified.
 
-\code{read.aa} returns a data frame of amino acid composition based on the contents of the indicated \code{file}, which should be a CSV file with the same column names as \code{thermo$protein}.
-
 Given amino acid composition returned by the \code{*aa} functions described above, \code{add.protein} adds them to \code{thermo$protein} for use by other functions in CHNOSZ.
 The amino acid compositions of proteins in \code{aa} with the same name as one in \code{thermo$protein} are replaced.
 The value returned by this function is the rownumbers of \code{thermo$protein} that are added and/or replaced.

Modified: pkg/CHNOSZ/man/protein.Rd
===================================================================
--- pkg/CHNOSZ/man/protein.Rd	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/man/protein.Rd	2017-02-24 06:46:29 UTC (rev 174)
@@ -18,7 +18,8 @@
 ## logfO2-pH potential diagram
 # with a charged basis, we calculate properties of ionized proteins
 basis("CHNOS+")
-aa <- read.aa(system.file("extdata/protein/DS11.csv", package = "CHNOSZ"))
+file <- system.file("extdata/protein/DS11.csv", package = "CHNOSZ")
+aa <- read.csv(file, as.is=TRUE)
 aa <- aa[grep("transferase", aa$protein), ]
 ip <- add.protein(aa)
 a <- affinity(pH=c(0, 14), O2=c(-64, -61), T=75, iprotein=ip)

Modified: pkg/CHNOSZ/man/protein.info.Rd
===================================================================
--- pkg/CHNOSZ/man/protein.info.Rd	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/man/protein.info.Rd	2017-02-24 06:46:29 UTC (rev 174)
@@ -14,7 +14,7 @@
 }
 
 \usage{
-  pinfo(protein, organism=NULL, residue=FALSE)
+  pinfo(protein, organism=NULL, residue=FALSE, regexp=FALSE)
   protein.length(protein, organism = NULL)
   protein.formula(protein, organism = NULL, residue = FALSE)
   protein.obigt(protein, organism = NULL, state=get("thermo")$opt$state)
@@ -26,6 +26,7 @@
   \item{protein}{character, names of proteins; numeric, species index of proteins; data frame; amino acid composition of proteins}
   \item{organism}{character, names of organisms}
   \item{residue}{logical, return per-residue values (those of the proteins divided by their lengths)?}
+  \item{regexp}{logical, find matches using regular expressions?}
   \item{normalize}{logical, return per-residue values (those of the proteins divided by their lengths)?}
   \item{state}{character, physical state}
   \item{T}{numeric, temperature in \eqn{^{\circ}}{°}C}
@@ -38,6 +39,9 @@
 The names can be supplied in the single \code{protein} argument (with an underscore, denoting protein_organism) or as pairs of \code{protein}s and \code{organism}s.
 NA is returned for any unmatched proteins, including those for which no \code{organism} is given or that do not have an underscore in \code{protein}.
 
+Alternatively, if \code{regexp} is TRUE, the \code{protein} argument is used as a pattern (regular expression); rownumbers of all matches of \code{thermo$protein$protein} to this pattern are returned.
+When using \code{regexp}, the \code{organism} can optionally be provided as a second pattern to return only those entries whose \code{thermo$protein$organism} also matches.
+
 For numeric \code{protein}, \code{pinfo} returns the corresponding row(s) of \code{thermo$protein}.
 Set \code{residue} to TRUE to return the per-residue composition (i.e. amino acid composition of the protein divided by total number of residues).
 
@@ -115,7 +119,7 @@
 # get amino acid compositions of microbial proteins 
 # generated from the RefSeq database 
 file <- system.file("extdata/refseq/protein_refseq.csv.xz", package="CHNOSZ")
-ip <- add.protein(read.aa(file))
+ip <- add.protein(read.csv(file, as.is=TRUE))
 # only use those organisms with a certain
 # number of sequenced bases
 ip <- ip[as.numeric(thermo$protein$abbrv[ip]) > 50000]
@@ -139,7 +143,42 @@
 axis(1, 1:15, terms, las=2)
 title(main=paste("Average oxidation state of carbon in proteins",
   "by taxID in NCBI RefSeq (after Dick, 2014)", sep="\n"))
+
+\dontshow{opar <- par(no.readonly=TRUE)}
+# using pinfo() with regexp=TRUE:
+# plot ZC and nH2O/residue of HOX proteins
+# basis species: glutamine-glutamic acid-cysteine-O2-H2O
+basis("QEC")
+# device setup
+par(mfrow=c(2, 2))
+# a red-blue scale from 1-13
+col <- ZC.col(1:13)
+# axis labels
+ZClab <- axis.label("ZC")
+nH2Olab <- expression(bar(italic(n))[H[2]*O])
+# loop over HOX gene clusters
+for(cluster in c("A", "B", "C", "D")) {
+  # get protein indices
+  pattern <- paste0("^HX", cluster)
+  ip <- pinfo(pattern, "HUMAN", regexp=TRUE)
+  # calculate ZC and nH2O/residue
+  thisZC <- ZC(protein.formula(ip))
+  thisH2O <- protein.basis(ip)[, "H2O"] / protein.length(ip)
+  # plot lines
+  plot(thisZC, thisH2O, type="l", xlab=ZClab, ylab=nH2Olab)
+  # the number of the HOX gene
+  pname <- pinfo(ip)$protein
+  nHOX <- as.numeric(gsub("[A-Za-z]*", "", pname))
+  # plot colored points
+  points(thisZC, thisH2O, pch=19, col=col[nHOX], cex=3.5)
+  points(thisZC, thisH2O, pch=19, col="white", cex=2.5)
+  # plot the number of the HOX gene
+  text(thisZC, thisH2O, nHOX)
+  # add title
+  title(main=paste0("HOX", cluster))
 }
+\dontshow{par(opar)}
+}
 
 \references{
   Dick, J. M., LaRowe, D. E. and Helgeson, H. C. (2006) Temperature, pressure, and electrochemical constraints on protein speciation: Group additivity calculation of the standard molal thermodynamic properties of ionized unfolded proteins. \emph{Biogeosciences} \bold{3}, 311--336. \url{http://dx.doi.org/10.5194/bg-3-311-2006}

Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/man/util.fasta.Rd	2017-02-24 06:46:29 UTC (rev 174)
@@ -105,7 +105,7 @@
 # the amino acid composition can be saved for future use
 write.csv(aa, "saved.aa.csv", row.names=FALSE)
 # in another R session, the protein can be loaded without using uniprot.aa()
-aa <- read.aa("saved.aa.csv")
+aa <- read.csv("saved.aa.csv", as.is=TRUE)
 add.protein(aa)
 
 ## count amino acids in a sequence

Modified: pkg/CHNOSZ/tests/testthat/test-add.protein.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-add.protein.R	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/tests/testthat/test-add.protein.R	2017-02-24 06:46:29 UTC (rev 174)
@@ -11,12 +11,12 @@
   ip <- add.protein(aa)
   # this replaces the proteins (with the same ones)
   expect_error(ip <- add.protein(aa), "converting factors causes problems replacing protein data")
-  # ... should use read.csv(file, stringsAsFactors=FALSE)
+  # ... should use read.csv(file, as.is=TRUE)
 })
 
 test_that("errors and messages occur in some circumstances", {
   expect_error(seq2aa("LYS_CHICK", "XXX"), "no characters match an amino acid")
-  expect_error(add.protein(count.aa("AAA")), "not a data frame with the same columns as thermo\\$protein")
+  expect_error(add.protein(count.aa("AAA")), "does not have the same columns as thermo\\$protein")
   expect_message(add.protein(pinfo(pinfo("CYC_BOVIN"))), "replaced 1 existing protein\\(s\\)")
 })
 

Modified: pkg/CHNOSZ/tests/testthat/test-affinity.R
===================================================================
--- pkg/CHNOSZ/tests/testthat/test-affinity.R	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/tests/testthat/test-affinity.R	2017-02-24 06:46:29 UTC (rev 174)
@@ -101,7 +101,8 @@
   basis(c("HCO3-", "H2O", "NH3", "HS-", "H2", "H+"),
     "aq", c(-3, 0, -4, -7, 999, 999))
   sites <- c("N", "S", "R", "Q", "P")
-  aa <- read.aa(system.file("extdata/protein/DS11.csv", package="CHNOSZ"))
+  file <- system.file("extdata/protein/DS11.csv", package="CHNOSZ")
+  aa <- read.csv(file, as.is=TRUE)
   ip <- add.protein(aa[1:5, ])
   # to reproduce, we need use the "old" parameters for [Met] from Dick et al., 2006
   mod.obigt("[Met]", G=-35245, H=-59310)

Modified: pkg/CHNOSZ/vignettes/anintro.Rmd
===================================================================
--- pkg/CHNOSZ/vignettes/anintro.Rmd	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/vignettes/anintro.Rmd	2017-02-24 06:46:29 UTC (rev 174)
@@ -1350,22 +1350,23 @@
 In the Rubisco example above, we saw the use of <span style="color:green">`read.fasta()`</span> to read amino acid sequences from a FASTA file.
 There are several other methods for inputting amino acid compositions.
 
+R's `read.csv()` can be used to read amino acid compositions from a CSV file with the same columns that are present in `thermo$protein`.
+Note the use of `as.is = TRUE` to prevent reading character data as factors.
+The `nrows` argument can be added to limit the number of rows that are read:
+```{r read_csv}
+file <- system.file("extdata/protein/DS11.csv", package = "CHNOSZ")
+aa_bison <- read.csv(file, as.is = TRUE, nrows = 5)
+```
 <span style="color:green">`more.aa()`</span> retrieves amino acid composition of proteins in *Saccharomyces cerevisiae* and *Escherichia coli* from data files that are included with CHNOSZ:
 ```{r more_aa}
 aa_YML020W <- more.aa("YML020W", "Sce")
 aa_ILVE <- more.aa("ILVE", "Eco")
 ```
-<span style="color:green">`read.aa()`</span> is used to read amino acid compositions from a CSV file with the same columns that are present in `thermo$protein`.
-The `nrows` argument can be added to read that number of rows:
-```{r read_aa}
-aa_bison <- read.aa(system.file("extdata/protein/DS11.csv",
-                                package = "CHNOSZ"), nrows = 5)
-```
 <span style="color:green">`read.fasta()`</span> reads a FASTA file and returns the amino acid compositions of the sequences.
 The `iseq` argument can be used to read those sequences from the file:
 ```{r read_fasta, message=FALSE}
-aa_Ef <- read.fasta(system.file("extdata/fasta/EF-Tu.aln",
-                                package = "CHNOSZ"), iseq = 1:2)
+file <- system.file("extdata/fasta/EF-Tu.aln", package = "CHNOSZ")
+aa_Ef <- read.fasta(file, iseq = 1:2)
 ```
 <span style="color:green">`seq2aa()`</span> counts the amino acids in a user-supplied sequence and generates a data frame of the amino acid composition:
 ```{marginfigure}
@@ -1476,7 +1477,8 @@
 Then we add the proteins and get their indices using <span style="color:red">`add.protein()`</span>, set the basis, calculate the affinities, and make a potential diagram with temperature and activity of dissolved hydrogen as variables:
 
 ```{r bison_transferase, fig.margin=TRUE, fig.width=4, fig.height=4, small.mar=TRUE, dpi=dpi, out.width="100%", echo=FALSE, results="hide", message=FALSE, fig.cap='Potential diagram for metagenomically identified sequences of transferases in Bison Pool hot spring. See also the vignette [<span style="color:blue">*Hot-spring proteins in CHNOSZ*</span>](hotspring.pdf).', cache=TRUE, pngquant=pngquant, timeit=timeit}
-aa <- read.aa(system.file("extdata/protein/DS11.csv", package = "CHNOSZ"))
+file <- system.file("extdata/protein/DS11.csv", package = "CHNOSZ")
+aa <- read.csv(file, as.is = TRUE)
 aa <- aa[grep("transferase", aa$protein), ]
 ip <- add.protein(aa)
 bspecies <- c("HCO3-", "H2O", "NH3", "HS-", "H2", "H+")
@@ -1492,12 +1494,12 @@
 lines(T, logaH2, lty = 2, lwd = 2)
 points(T, logaH2, pch = 21, bg = "white", cex = 1.5)
 ```
-```{r bison_transferase, eval=FALSE, echo=1:11}
+```{r bison_transferase, eval=FALSE, echo=1:12}
 ```
 Site numbers 1--5 correspond to a cooling gradient along the outflow channel of the hot spring.
 The colors represent the relative `r zc` of the proteins (red is more reduced).
 The points indicate the *T* and log*a*<sub>H<sub>2</sub></sub> that optimize a thermodynamic model for relative abundances of phyla that were estimated by taxonomic classification of metagenomic sequences [@DS13]:
-```{r bison_transferase, eval=FALSE, echo=12:15}
+```{r bison_transferase, eval=FALSE, echo=13:16}
 ```
 
 # Experimental features

Modified: pkg/CHNOSZ/vignettes/hotspring.Rnw
===================================================================
--- pkg/CHNOSZ/vignettes/hotspring.Rnw	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/vignettes/hotspring.Rnw	2017-02-24 06:46:29 UTC (rev 174)
@@ -132,8 +132,8 @@
 
 <<proteins>>=
 # read the amino acid compositions
-aa.annot <- read.aa(system.file("extdata/protein/DS11.csv", package="CHNOSZ"))
-aa.phyla <- read.aa(system.file("extdata/protein/DS13.csv", package="CHNOSZ"))
+aa.annot <- read.csv(system.file("extdata/protein/DS11.csv", package="CHNOSZ"), as.is=TRUE)
+aa.phyla <- read.csv(system.file("extdata/protein/DS13.csv", package="CHNOSZ"), as.is=TRUE)
 @
 
 Here are the site names for the sampling locations (also referred

Modified: pkg/CHNOSZ/vignettes/hotspring.lyx
===================================================================
--- pkg/CHNOSZ/vignettes/hotspring.lyx	2017-02-24 02:07:50 UTC (rev 173)
+++ pkg/CHNOSZ/vignettes/hotspring.lyx	2017-02-24 06:46:29 UTC (rev 174)
@@ -482,12 +482,14 @@
 
 \begin_layout Plain Layout
 
-aa.annot <- read.aa(system.file("extdata/protein/DS11.csv", package="CHNOSZ"))
+aa.annot <- read.csv(system.file("extdata/protein/DS11.csv", package="CHNOSZ"),
+ as.is=TRUE)
 \end_layout
 
 \begin_layout Plain Layout
 
-aa.phyla <- read.aa(system.file("extdata/protein/DS13.csv", package="CHNOSZ"))
+aa.phyla <- read.csv(system.file("extdata/protein/DS13.csv", package="CHNOSZ"),
+ as.is=TRUE)
 \end_layout
 
 \begin_layout Plain Layout