[CHNOSZ-commits] r23 - in pkg/CHNOSZ: . R inst/doc man vignettes

Tue Sep 25 04:38:34 CEST 2012

Author: jedick
Date: 2012-09-25 04:38:33 +0200 (Tue, 25 Sep 2012)
New Revision: 23

Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/more.aa.R
   pkg/CHNOSZ/R/read.expr.R
   pkg/CHNOSZ/inst/doc/anintro.pdf
   pkg/CHNOSZ/man/extdata.Rd
   pkg/CHNOSZ/man/more.aa.Rd
   pkg/CHNOSZ/man/read.expr.Rd
   pkg/CHNOSZ/vignettes/anintro.Rnw
   pkg/CHNOSZ/vignettes/anintro.lyx
Log:
yeastgfp() accepts multiple locations and more.aa() accepts list


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/DESCRIPTION	2012-09-25 02:38:33 UTC (rev 23)
@@ -1,4 +1,4 @@
-Date: 2012-09-24
+Date: 2012-09-25
 Package: CHNOSZ
 Version: 0.9-7.98
 Title: Chemical Thermodynamics and Activity Diagrams

Modified: pkg/CHNOSZ/R/more.aa.R
===================================================================
--- pkg/CHNOSZ/R/more.aa.R	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/R/more.aa.R	2012-09-25 02:38:33 UTC (rev 23)
@@ -27,31 +27,43 @@
     # which columns have the amino acids in the order of thermo$protein 
     iaa <- 1:20 + 5
   }
-  # find the matches
-  icols <- match(searchcols, colnames(mydata))
-  imatch <- match(protein, mydata[, icols[1]])
-  imatch2 <- match(protein, mydata[, icols[2]])
-  # use not-NA matches for "abbrv" in Eco.csv
-  imatch[!is.na(imatch2)] <- imatch2[!is.na(imatch2)]
-  # report and remember the unsuccessful matches
-  if(all(is.na(imatch))) stop("no proteins found!")
-  inotmatch <- which(is.na(imatch)) 
-  if(length(inotmatch) > 0) {
-    if(length(inotmatch)==1) verb <- " was" else verb <- " were"
-    msgout("more.aa: ", paste(protein[inotmatch], collapse=" "), verb, " not matched\n")
+  # iterate over a list
+  waslist <- TRUE
+  out <- list()
+  if(!is.list(protein)) {
+    waslist <- FALSE
+    protein <- list(protein)
   }
-  aa <- data.frame(mydata[imatch, iaa])
-  # add the identifying columns
-  organism <- rep(organism, length(protein))
-  abbrv <- rep(NA, length(protein))
-  ref <- rep(NA, length(protein))
-  chains <- rep(1, length(protein))
-  chains[inotmatch] <- NA
-  precols <- data.frame(protein, organism, ref, abbrv, chains, stringsAsFactors=FALSE)
-  colnames(aa) <- aminoacids(3)
-  aa <- cbind(precols, aa)
+  for(i in 1:length(protein)) {
+    # find the matches
+    icols <- match(searchcols, colnames(mydata))
+    imatch <- match(protein[[i]], mydata[, icols[1]])
+    imatch2 <- match(protein[[i]], mydata[, icols[2]])
+    # use not-NA matches for "abbrv" in Eco.csv
+    imatch[!is.na(imatch2)] <- imatch2[!is.na(imatch2)]
+    # report and remember the unsuccessful matches
+    if(all(is.na(imatch))) stop("no proteins found!")
+    inotmatch <- which(is.na(imatch)) 
+    if(length(inotmatch) > 0) {
+      if(length(inotmatch)==1) verb <- " was" else verb <- " were"
+      msgout("more.aa: ", paste(protein[[i]][inotmatch], collapse=" "), verb, " not matched\n")
+    }
+    aa <- data.frame(mydata[imatch, iaa])
+    # add the identifying columns
+    organism <- rep(organism[[1]], length(protein[[i]]))
+    abbrv <- rep(NA, length(protein[[i]]))
+    ref <- rep(NA, length(protein[[i]]))
+    chains <- rep(1, length(protein[[i]]))
+    chains[inotmatch] <- NA
+    precols <- data.frame(protein[[i]], organism, ref, abbrv, chains, stringsAsFactors=FALSE)
+    colnames(precols)[1] <- "protein"
+    colnames(aa) <- aminoacids(3)
+    aa <- cbind(precols, aa)
+    out <- c(out, list(aa))
+  }
   # done!
-  return(aa)
+  if(!waslist) return(out[[1]])
+  else return(out)
 }
 
 

Modified: pkg/CHNOSZ/R/read.expr.R
===================================================================
--- pkg/CHNOSZ/R/read.expr.R	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/R/read.expr.R	2012-09-25 02:38:33 UTC (rev 23)
@@ -39,22 +39,31 @@
       length(ygfp$abundance[!is.na(ygfp$abundance)]), " abundances\n")
     return(invisible(colnames(ygfp)[6:28]))
   }
-  # what location do we want?
-  ncol <- match(location, colnames(ygfp)[6:28]) + 5
-  if(is.na(ncol)) ncol <- agrep(location, colnames(ygfp)[6:28])[1] + 5
-  if(is.na(ncol)) stop(paste(location, "is not one of the subcellular locations in", ypath))
-  thisygfp <- ygfp[, ncol]
-  if(exclusive) {
-    # find the number of localizations of each ORF
-    localizations <- numeric(nrow(ygfp))
-    for(i in 6:28) localizations <- localizations + as.logical(ygfp[,i])
-    if(all(localizations[thisygfp] > 1)) msgout("yeastgfp: no exclusive localization found for ",location,
-      " ... using non-exclusive localizations\n",sep="")
-    else thisygfp <- thisygfp & ! localizations > 1
+  # iterate over multiple locations
+  out <- list()
+  for(i in 1:length(location)) {
+    # what location do we want?
+    ncol <- match(location[i], colnames(ygfp)[6:28]) + 5
+    if(is.na(ncol)) ncol <- agrep(location[i], colnames(ygfp)[6:28])[1] + 5
+    if(is.na(ncol)) stop(paste(location[i], "is not one of the subcellular locations in", ypath))
+    thisygfp <- ygfp[, ncol]
+    if(exclusive) {
+      # find the number of localizations of each ORF
+      localizations <- numeric(nrow(ygfp))
+      for(j in 6:28) localizations <- localizations + as.logical(ygfp[,j])
+      if(all(localizations[thisygfp] > 1)) msgout("yeastgfp: no exclusive localization found for ",location[i],
+        " ... using non-exclusive localizations\n",sep="")
+      else thisygfp <- thisygfp & ! localizations > 1
+    }
+    protein <- as.character(ygfp$yORF[thisygfp])
+    abundance <- ygfp$abundance[thisygfp]
+    if(length(location)==1) out <- list(protein=protein, abundance=abundance)
+    else {
+      out$protein <- c(out$protein, list(protein))
+      out$abundance <- c(out$abundance, list(abundance))
+    }
   }
-  protein <- as.character(ygfp$yORF[thisygfp])
-  abundance <- ygfp$abundance[thisygfp]
-  return(list(protein=protein, abundance=abundance))
+  return(out)
 }
 
 read.expr <- function(file, idcol, abundcol, filter=NULL) {

Modified: pkg/CHNOSZ/inst/doc/anintro.pdf
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/man/extdata.Rd	2012-09-25 02:38:33 UTC (rev 23)
@@ -12,8 +12,8 @@
   Files in \code{abundance} contain protein abundance data:
   \itemize{
     \item \code{stress} is a data frame listing proteins identified in selected proteomic stress response experiments. The names of proteins begin at row 3, and columns are all the same length (padded as necessary at the bottom by \code{NA}s). Names correspond to ordered locus names (for \samp{Sce}) or gene names (for \samp{Eco}). The column names identify the experiments, the first row contains the name of the organism (\samp{Sce} or \samp{Eco}) and the third row has the reference key for the source of the data (listed in \code{\link{thermo}$refs}).
-    \item \code{AA03.csv} has reference abundances for 71 proteins taken from Fig. 3 of Anderson and Anderson, 2002 (as corrected in Anderson and Anderson, 2003). The columns with data taken from these sources are type (hemoglobin, plasma, tissue, or interleukin), description (name used in the original figure), log10(pg/ml) (\emph{upper limit} of abundance interval shown in Anderson and Anderson, 2003, log10 of concentration in pg/ml). The additional columns are data derived from a search of the SWISS-PROT/UniProtKB database based on the descriptions of the proteins: name (nominal UniProtKB name for this protein), name2 (other UniProtKB names(s) that could apply to the protein), and note (notes based on searching for a protein of this description). The amino acid compositions of all proteins whose names are not NA are included in \code{thermo$protein}. The \code{abbrv} column for the proteins contains the description given by Anderson and Anderson, 2003, followed by (in parentheses) the UniProtKB accession number. Annotated initiator methionines (e.g. for ferritin, myoglobin, ENOG), signal peptides or propeptides were removed from the proteins (except where they are not annotated in UniProtKB: IGHG1, IGHA1, IGHD, MBP). In cases were multiple isoforms are present in UniProtKB (e.g. Albumin) only the first isoform was taken. In the case of C4 Complement (CO4A) and C5 Complement (CO5), the amino acid composition of only the alpha chains are listed. In the case of the protein described as iC3b, the amino acid sequence is taken to be that of Complement C3c alpha' chain fragment 1 from CO3, and is given the name CO3.C3c. The non-membrane (soluble) chains of TNF-binding protein (TNR1A) and TNF-alpha (TNFA) were used. Rantes, MIP-1 beta and MIP-1 alpha were taken from C-C motif chemokines (CCL5, CCL4, CCL3 respectively). C-peptide was taken from the corresponding annotation for insulin and here is named INS.C. 
See the \sQuote{protactiv} vignette for an example that uses this file.
-    \item \code{ISR+08.csv} has columns excerpted from Additional File 2 of Ishihama et al. (2008) for protein abundances in \emph{E. coli} cytosol. The columns in this file are ID (Swiss-Prot ID), accession (Swiss-Prot accession), emPAI (exponentially modified protein abundance index), copynumber (emPAI-derived copy number/cell), GRAVY (Kyte-Doolittel), FunCat (FunCat class description), PSORT (PSORT localisation), ribosomal (yes/no). See \code{\link{read.expr}} and the \sQuote{protactiv} vignette for examples that use this file.
+    \item \code{AA03.csv} has reference abundances for 71 proteins taken from Fig. 3 of Anderson and Anderson, 2002 (as corrected in Anderson and Anderson, 2003). The columns with data taken from these sources are type (hemoglobin, plasma, tissue, or interleukin), description (name used in the original figure), log10(pg/ml) (\emph{upper limit} of abundance interval shown in Anderson and Anderson, 2003, log10 of concentration in pg/ml). The additional columns are data derived from a search of the SWISS-PROT/UniProtKB database based on the descriptions of the proteins: name (nominal UniProtKB name for this protein), name2 (other UniProtKB name(s) that could apply to the protein), and note (notes based on searching for a protein of this description). The amino acid compositions of all proteins whose names are not NA are included in \code{thermo$protein}. The \code{abbrv} column for the proteins contains the description given by Anderson and Anderson, 2003, followed by (in parentheses) the UniProtKB accession number. Annotated initiator methionines (e.g. for ferritin, myoglobin, ENOG), signal peptides or propeptides were removed from the proteins (except where they are not annotated in UniProtKB: IGHG1, IGHA1, IGHD, MBP). In cases where multiple isoforms are present in UniProtKB (e.g. Albumin) only the first isoform was taken. In the case of C4 Complement (CO4A) and C5 Complement (CO5), the amino acid compositions of only the alpha chains are listed. In the case of the protein described as iC3b, the amino acid sequence is taken to be that of Complement C3c alpha' chain fragment 1 from CO3, and is given the name CO3.C3c. The non-membrane (soluble) chains of TNF-binding protein (TNR1A) and TNF-alpha (TNFA) were used. Rantes, MIP-1 beta and MIP-1 alpha were taken from C-C motif chemokines (CCL5, CCL4, CCL3 respectively). C-peptide was taken from the corresponding annotation for insulin and here is named INS.C. 
See \code{\link{protein}} and \code{\link{read.expr}} for examples that use this file.
+    \item \code{ISR+08.csv} has columns excerpted from Additional File 2 of Ishihama et al. (2008) for protein abundances in \emph{E. coli} cytosol. The columns in this file are ID (Swiss-Prot ID), accession (Swiss-Prot accession), emPAI (exponentially modified protein abundance index), copynumber (emPAI-derived copy number/cell), GRAVY (Kyte-Doolittle), FunCat (FunCat class description), PSORT (PSORT localisation), ribosomal (yes/no). See \code{\link{read.expr}} for examples that use this file.
 %%      \item \code{GLL+98.csv} has columns "oln" for ordered locus name and "ratio" for change in expression of yeast proteins in response to H2O2 treatment, from Godon et al., 1998. One protein, YMR108W, was listed as both induced and repressed in the original data set and is not included in this table.
     \item \code{yeastgfp.csv.xz} Has 28 columns; the names of the first five are \code{yORF}, \code{gene name}, \code{GFP tagged?}, \code{GFP visualized?}, and \code{abundance}. The remaining columns correspond to the 23 subcellular localizations considered in the YeastGFP project (Huh et al., 2003 and Ghaemmaghami et al., 2003) and hold values of either \code{T} or \code{F} for each protein. \samp{yeastgfp.csv} was downloaded on 2007-02-01 from http://yeastgfp.ucsf.edu using the Advanced Search, setting options to download the entire dataset and to include localization table and abundance, sorted by orf number. See \code{\link{yeastgfp}} for examples that use this file.
   }
@@ -44,8 +44,8 @@
 
   Files in \code{protein} contain protein composition data for model organisms:
   \itemize{
-    \item \code{Sce.csv.xz} Data frame of amino acid composition of proteins from the \emph{Saccharomyces} Genome Database. Contains twenty-two columns. Values in the first column are the rownumbers, the second column (\code{OLN}) has the ordered locus names of proteins, and the remaining twenty columns (\code{Ala}..\code{Val}) contain the numbers of the respective amino acids in each protein; the columns are arranged in alphabetical order based on the three-letter abbreviations for the amino acids. The source of data for \samp{Sce.csv} is the file \samp{protein_properties.tab} found on the FTP site of the SGD project on 2008-08-04. Blank entries were replaced with "NA" and column headings were added. See \code{\link{more.aa}} for examples that use this file.
-    \item \code{Eco.csv.xz} Contains 24 columns. Values in the first column correspond to rownumbers, the second column {\code{AC}} holds the accession numbers of the proteins, the third column (\code{Name}) has the names of the corresponding genes, and the fourth column {\code{OLN}} lists the ordered locus names of the proteins. The remaining twenty columns (\code{A}..\code{Y}) give the numbers of the respective amino acids in each protein and are ordered alphabetically by the one-letter abbreviations of the amino acids. The sources of data for \samp{Eco.csv} are the files \samp{ECOLI.dat} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/entries/bacteria} and \samp{ECOLI.fas} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) on 2007-12-20. The proteins can be included in calculations using \code{\link{more.aa}} as well as \code{\link{read.expr}}; see the \sQuote{protactiv} vignette for an example that uses the latter function.
+    \item \code{Sce.csv.xz} Data frame of amino acid composition of proteins from the \emph{Saccharomyces} Genome Database. Contains twenty-two columns. Values in the first column are the rownumbers, the second column (\code{OLN}) has the ordered locus names of proteins, and the remaining twenty columns (\code{Ala}..\code{Val}) contain the numbers of the respective amino acids in each protein; the columns are arranged in alphabetical order based on the three-letter abbreviations for the amino acids. The source of data for \samp{Sce.csv} is the file \samp{protein_properties.tab} found on the FTP site of the SGD project on 2008-08-04. Blank entries were replaced with "NA" and column headings were added. See \code{\link{read.expr}} for examples that use this file.
+    \item \code{Eco.csv.xz} Contains 24 columns. Values in the first column correspond to rownumbers, the second column {\code{AC}} holds the accession numbers of the proteins, the third column (\code{Name}) has the names of the corresponding genes, and the fourth column {\code{OLN}} lists the ordered locus names of the proteins. The remaining twenty columns (\code{A}..\code{Y}) give the numbers of the respective amino acids in each protein and are ordered alphabetically by the one-letter abbreviations of the amino acids. The sources of data for \samp{Eco.csv} are the files \samp{ECOLI.dat} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/entries/bacteria} and \samp{ECOLI.fas} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) on 2007-12-20. The proteins can be included in calculations using \code{\link{more.aa}} as well as \code{\link{read.expr}}.
 %%    \item \code{HUM.csv.xz} Downloaded the file \code{uniprot_sprot_human.dat.gz}, dated 2010-08-10, from \url{ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/}, converted from UniProt to FASTA format using the \code{seqret} tool from EMBOSS (\url{http://emboss.sourceforge.net/}). Counted amino acid frequencies using \code{\link{read.fasta}}. Columns are as described in \code{thermo$protein}, except column \code{protein} and \code{abbrv} contain Swiss-Prot name and accession number, respectively (both taken from the header lines in the FASTA file).
   }
 

Modified: pkg/CHNOSZ/man/more.aa.Rd
===================================================================
--- pkg/CHNOSZ/man/more.aa.Rd	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/man/more.aa.Rd	2012-09-25 02:38:33 UTC (rev 23)
@@ -11,19 +11,23 @@
 
 \arguments{
   \item{protein}{character, name of protein}
-  \item{organism}{character, name of organism (\samp{Eco} or \samp{Sgd})}
+  \item{organism}{character, name of organism (\samp{Eco} or \samp{Sce})}
 }
 
 \details{
-  \code{more.aa} retrieves the amino acid composition(s) of the proteins for either \emph{Escherichia coli} or \emph{Saccharomyces cerevisiae} (for \code{organism} equal to \samp{ECO} or \samp{SGD}, respectively). The calculation depends on the data files \code{\link{extdata}/protein/Eco.csv.xz} and \code{\link{extdata}/protein/Sce.csv.xz}, which contain the amino acid compositions of proteins in these organisms. The \code{protein} can one or more Ordered Locus Names (OLN) or Open Reading Frame (ORF) names that are found in these files. The output data frame contains rows with NA compositions for names that are not matched.
+\code{more.aa} retrieves the amino acid composition(s) of the indicated proteins in either \emph{Escherichia coli} or \emph{Saccharomyces cerevisiae}.
+The value of \code{organism} can be one of \samp{Eco} or \samp{Sce}.
+The calculation depends on the data files \code{\link{extdata}/protein/Eco.csv.xz} and \code{\link{extdata}/protein/Sce.csv.xz}, which contain the amino acid compositions of the proteins.
+The \code{protein} argument should be a vector or a list of vectors of one or more Ordered Locus Names (OLN) or Open Reading Frame (ORF) names that are found in these files.
+Each output data frame contains rows with NA compositions for names that are not matched; an error is produced if none of the names in a vector are matched.
 }
 
 \value{
-  A data frame containing the amino acid composition(s) of the specified protein(s) in the format of \code{\link{thermo}$protein}.
+A data frame, or list of data frames, containing the amino acid composition(s) of the specified protein(s) in the format of \code{\link{thermo}$protein}.
 }
 
 \seealso{
-  There are examples of using this function in conjunction with lists of protein names from \code{\link{yeastgfp}} and \code{\link{read.expr}}.
+\code{\link{extdata}} describes the sources of compositional data for the proteins. Other examples of usage of \code{more.aa} are shown for \code{\link{read.expr}}.
 }
 
 

Modified: pkg/CHNOSZ/man/read.expr.Rd
===================================================================
--- pkg/CHNOSZ/man/read.expr.Rd	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/man/read.expr.Rd	2012-09-25 02:38:33 UTC (rev 23)
@@ -33,16 +33,18 @@
 
   \code{stress} is the simplest of these functions since the source of its data, \code{stress.csv}, lists proteins without any abundance data. \code{condition} indicates the name of the stress response experiment (column name of \code{stress.csv}, e.g. \samp{low.C}) and \code{organism} indicates the organism (\samp{Eco} or \samp{Sce}).
 
-  The \code{yeastgfp} function returns the identities and abundances of proteins with the requested subcellular localization (specified in \code{location}) using data from the YeastGFP project that is stored in \code{\link{extdata}/abundance/yeastgfp.csv.xz}. The default value of \code{exclusive} (\code{FALSE}) tells the function to grab all proteins that are localized to a compartment even if they are also localized to other compartments. If \code{exclusive} is \code{TRUE}, only those proteins that are localized exclusively to the requested compartments are identified, unless there are no such proteins, then the non-exclusive localizations are used (applies to the \samp{bud} localization). The values returns by \code{yeastgfp} can be fed to \code{pdata.aa} in order to get the amino acid compositions of the proteins.
+The \code{yeastgfp} function returns the identities and abundances of proteins with the requested subcellular localization(s) (specified in \code{location}) using data from the YeastGFP project that is stored in \code{\link{extdata}/abundance/yeastgfp.csv.xz}.
+The default value of \code{exclusive} (\code{FALSE}) tells the function to grab all proteins that are localized to a compartment even if they are also localized to other compartments.
+If \code{exclusive} is \code{TRUE}, only those proteins that are localized exclusively to the requested compartment are identified; if there are no such proteins, the non-exclusive localizations are used instead (this applies to the \samp{bud} localization).
 
   \code{read.expr} reads a \code{file} (CSV format) that contains protein sequence names or IDs and protein abundance data. \code{idcol} and \code{abundcol} are either the names of the columns holding the sequence IDs and protein abundances, or numeric values indicating the column numbers where these data are found. The column indicated by \code{abundcol} might not actually be abundance (it is likely to be abundance ratios). The data can be filtered to only include records that contain the term in the named argument \code{filter}, the name of which indicates the column to apply the filter to.
 
-  The function returns values of the logarithms of activities of the proteins. We associate molality with activity (i.e., activity coefficients are implicitly unity).  If \code{loga.total} is not NULL, the abundances of the proteins from the data file are scaled to give a logarithm of total activity of amino acid residues equal to the value in \code{loga.total}, usually set to zero (see \code{\link{unitize}}). This operation preserves the relative abundances of the proteins. If the abundances of the proteins in the file are already in logarithmic units, set \code{is.log} to TRUE.
-
 }
 
 \value{
-  Each of these functions returns a list with elements named \code{protein} (names of proteins) and \code{abundance} (counts or concentrations without any conversion from the units in the data file). For \code{stress}, the \code{abundance} value is all 1's. For \code{yeastgfp}, if \code{location} is NULL, the function returns the names of all locations.
+Each of these functions returns a list with elements named \code{protein} (names of proteins) and \code{abundance} (counts or concentrations without any conversion from the units in the data file).
+For \code{stress}, the \code{abundance} value is all 1's.
+For \code{yeastgfp}, if \code{location} is NULL, the function returns the names of all known locations, and if the length of \code{location} is >1, the \code{protein} and \code{abundance} values are lists of the results for each location.
 }
 
 \seealso{
@@ -204,12 +206,14 @@
 inames <- 1:length(names)
 # define the system
 basis("CHNOS+")
-# calculate amino acid compositions using "pdata.aa" function 
+# get protein names and abundances in each location
+gfp <- yeastgfp(names)
+# get amino acid compositions of proteins
+aa <- more.aa(gfp$protein, "Sce")
+# calculate average amino acid compositions 
 for(i in 1:length(names)) {
-  gfp <- yeastgfp(names[i])
-  aa <- more.aa(gfp$protein, "Sce")
-  aa <- aasum(aa, gfp$abundance, average=TRUE, protein=names[i])
-  add.protein(aa)
+  avgaa <- aasum(aa[[i]], gfp$abundance[[i]], average=TRUE, protein=names[i])
+  add.protein(avgaa)
 }
 species(names, "Sce")
 a <- affinity(H2O=c(-5,0,256),O2=c(-80,-66,256))

Modified: pkg/CHNOSZ/vignettes/anintro.Rnw
===================================================================
--- pkg/CHNOSZ/vignettes/anintro.Rnw	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/vignettes/anintro.Rnw	2012-09-25 02:38:33 UTC (rev 23)
@@ -584,7 +584,7 @@
 \section{Activity diagrams}
 
 
-\subsection{Quicker example: Bjerrum diagram}
+\subsection{Carbonate speciation (Bjerrum diagram)}
 
 The sequence of commands \texttt{basis}-\texttt{species}-\texttt{affinity}-\texttt{diagram},
 with various arguments, can be used to create a wide variety of diagrams.
@@ -599,7 +599,8 @@
 two lines are optional, unless you really do want to see the effect
 of temperature (I do!)
 
-<<Bjerrum_diagram>>=
+\setkeys{Gin}{width=0.6\textwidth}
+<<Bjerrum_diagram,fig=TRUE,width=4,height=4>>=
 basis("CHNOS+")
 species(c("CO2", "HCO3-", "CO3-2"))
 a <- affinity(pH=c(4, 12))
@@ -607,10 +608,11 @@
 a <- affinity(pH=c(4, 12), T=150)
 diagram(a, add=TRUE, col="red")
 @
+\setkeys{Gin}{width=1.0\textwidth}
 
 \selectlanguage{english}%
 
-\subsection{Quick example: stability diagram for proteins}
+\subsection{Stability diagram for proteins}
 
 Suppose that we are asked to calculate the relative stabilities of
 some proteins from different organisms. We will use part of a case
@@ -673,22 +675,6 @@
 is relatively stable at more reduced conditions.
 
 
-\subsection{How does this work?}
-
-Here is a partial explanation: You use \texttt{affinity()} to calculate
-the chemical affinities of the formation reactions of the proteins,
-taking into account chemical activities of the proteins that are set
-to reference, non-equilibrium values. Then, the \texttt{diagram()}
-function transforms these non-equilibrium affinities into chemical
-activities of the proteins at metastable equilibrium (this is actually
-achieved using the Boltzmann distribution). These activities satisfy
-the conditions that 1) the total activity of an immobile component
-(for proteins, this defaults to the protein backbone group) is constant
-and 2) the chemical affinities of the formation reactions are all
-equal (but generally not zero). More details can be found in another
-vignette (``\texttt{protactiv}'').
-
-
 \subsection{More proteins, more dimensions}
 
 Now let's add some bacterial surface-layer proteins. They are in some
@@ -896,11 +882,11 @@
 \setkeys{Gin}{width=0.7\textwidth}
 <<yeastplot, fig=TRUE, results=hide, width=7, height=5>>=
 locations <- yeastgfp()
+gfp <- yeastgfp(locations)
+aa <- more.aa(gfp$protein, "Sce")
 for(i in 1:length(locations)) {
-  gfp <- yeastgfp(locations[i])
-  aa <- more.aa(gfp$protein, "Sce")
-  aa <- aasum(aa, gfp$abundance, average=TRUE, protein=locations[i])
-  add.protein(aa)
+  avgaa <- aasum(aa[[i]], gfp$abundance[[i]], average=TRUE, protein=locations[i])
+  add.protein(avgaa)
 }
 basis("CHNOS+")
 species(locations, "Sce")
@@ -1048,13 +1034,13 @@
 basis("CHNOS")
 species(c("isoleucine", "tyrosine", "glutamic acid", "methionine", "aspartic acid"))
 f <- findit(list(CO2=c(-5, 5), O2=c(-85, -65), H2S=c(-10, 5), H2O=c(-10, 0)), 
-  niter=10, res=10, balance=1)
+  niter=5, res=10, balance=1)
 @
 \setkeys{Gin}{width=1.0\textwidth}
 \end{small}
 
 \selectlanguage{english}%
-After 10 iterations, what are the fractional equilibrium abundances
+After 5 iterations, what are the fractional equilibrium abundances
 of the amino acids? Note that, during its operation, \texttt{findit()}
 updates the activities of the basis species so we don't have to set
 them manually.

Modified: pkg/CHNOSZ/vignettes/anintro.lyx
===================================================================
--- pkg/CHNOSZ/vignettes/anintro.lyx	2012-09-24 14:13:01 UTC (rev 22)
+++ pkg/CHNOSZ/vignettes/anintro.lyx	2012-09-25 02:38:33 UTC (rev 23)
@@ -1635,10 +1635,10 @@
 
 \begin_layout Standard
 \begin_inset Branch stuff
-status open
+status collapsed
 
 \begin_layout Subsection
-Quicker example: Bjerrum diagram
+Carbonate speciation (Bjerrum diagram)
 \end_layout
 
 \begin_layout Standard
@@ -1703,11 +1703,20 @@
 
 \begin_layout Chunk
 
-<<Bjerrum_diagram>>=
+
+\backslash
+setkeys{Gin}{width=0.6
+\backslash
+textwidth}
 \end_layout
 
 \begin_layout Chunk
 
+<<Bjerrum_diagram,fig=TRUE,width=4,height=4>>=
+\end_layout
+
+\begin_layout Chunk
+
 basis("CHNOS+")
 \end_layout
 
@@ -1741,8 +1750,17 @@
 @
 \end_layout
 
+\begin_layout Chunk
+
+
+\backslash
+setkeys{Gin}{width=1.0
+\backslash
+textwidth}
+\end_layout
+
 \begin_layout Subsection
-Quick example: stability diagram for proteins
+Stability diagram for proteins
 \end_layout
 
 \begin_layout Standard
@@ -1928,44 +1946,6 @@
 \end_layout
 
 \begin_layout Subsection
-How does this work?
-\end_layout
-
-\begin_layout Standard
-Here is a partial explanation: You use 
-\family typewriter
-affinity()
-\family default
- to calculate the chemical affinities of the formation reactions of the
- proteins, taking into account chemical activities of the proteins that
- are set to reference, non-equilibrium values.
- Then, the 
-\family typewriter
-diagram()
-\family default
- function transforms these non-equilibrium affinities into chemical activities
- of the proteins at metastable equilibrium (this is actually achieved using
- the Boltzmann distribution).
- These activities satisfy the conditions that 1) the total activity of an
- immobile component (for proteins, this defaults to the protein backbone
- group) is constant and 2) the chemical affinities of the formation reactions
- are all equal (but generally not zero).
- More details can be found in another vignette (
-\begin_inset Quotes eld
-\end_inset
-
-
-\family typewriter
-protactiv
-\family default
-
-\begin_inset Quotes erd
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsection
 More proteins, more dimensions
 \end_layout
 
@@ -2760,27 +2740,28 @@
 
 \begin_layout Chunk
 
-for(i in 1:length(locations)) {
+gfp <- yeastgfp(locations)
 \end_layout
 
 \begin_layout Chunk
 
-  gfp <- yeastgfp(locations[i])
+aa <- more.aa(gfp$protein, "Sce")
 \end_layout
 
 \begin_layout Chunk
 
-  aa <- more.aa(gfp$protein, "Sce")
+for(i in 1:length(locations)) {
 \end_layout
 
 \begin_layout Chunk
 
-  aa <- aasum(aa, gfp$abundance, average=TRUE, protein=locations[i])
+  avgaa <- aasum(aa[[i]], gfp$abundance[[i]], average=TRUE, protein=locations[i]
+)
 \end_layout
 
 \begin_layout Chunk
 
-  add.protein(aa)
+  add.protein(avgaa)
 \end_layout
 
 \begin_layout Chunk
@@ -3358,7 +3339,7 @@
 
 \begin_layout Standard
 \begin_inset Branch more
-status collapsed
+status open
 
 \begin_layout Subsection
 Findit
@@ -3434,7 +3415,7 @@
 
 \begin_layout Chunk
 
-  niter=10, res=10, balance=1)
+  niter=5, res=10, balance=1)
 \end_layout
 
 \begin_layout Chunk
@@ -3459,7 +3440,7 @@
 \end_layout
 
 \begin_layout Standard
-After 10 iterations, what are the fractional equilibrium abundances of the
+After 5 iterations, what are the fractional equilibrium abundances of the
  amino acids? Note that, during its operation, 
 \family typewriter
 findit()