[CHNOSZ-commits] r27 - in pkg/CHNOSZ: . R inst inst/extdata/bison inst/extdata/refseq man

Sun Oct 21 15:10:12 CEST 2012

Author: jedick
Date: 2012-10-21 15:10:12 +0200 (Sun, 21 Oct 2012)
New Revision: 27

Added:
   pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq55.blastp.xz
Removed:
   pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq49.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq49.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq49.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq49.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq49.blastp.xz
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/objective.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/inst/TODO
   pkg/CHNOSZ/inst/extdata/bison/gi.taxid.txt.xz
   pkg/CHNOSZ/inst/extdata/refseq/README.txt
   pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
   pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
   pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
   pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
   pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
   pkg/CHNOSZ/man/extdata.Rd
   pkg/CHNOSZ/man/objective.Rd
   pkg/CHNOSZ/man/protein.info.Rd
   pkg/CHNOSZ/man/util.blast.Rd
Log:
updated to RefSeq release 55


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/DESCRIPTION	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,6 +1,6 @@
-Date: 2012-10-13
+Date: 2012-10-21
 Package: CHNOSZ
-Version: 0.9.8-1
+Version: 0.9.8-2
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey M. Dick
 Maintainer: Jeffrey M. Dick <jmdick at asu.edu>

Modified: pkg/CHNOSZ/R/objective.R
===================================================================
--- pkg/CHNOSZ/R/objective.R	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/R/objective.R	2012-10-21 13:10:12 UTC (rev 27)
@@ -150,6 +150,20 @@
   optimum="minimal"
 )
 
+DGinf <- structure(
+  function(a1, a2) {
+    dginf <- function(a1, a2) {
+      # informatic Gibbs energy/2.303RT difference between assemblages
+      p1 <- a1/sum(a1)
+      p2 <- a2/sum(a2)
+      sum(p2 * log10(p2/p1))
+    }
+    DGinf <- apply(a1, 1, dginf, a2=a2)
+    return(DGinf)
+  },
+  optimum="minimal"
+)
+
 DGtr <- structure(
   function(loga1, loga2, Astar) {
     dgtr <- function(loga1, loga2, Astar) {

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/NEWS	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 0.9.8-1 (2012-10-13)
+CHANGES IN CHNOSZ 0.9.8-2 (2012-10-21)
 --------------------------------------
 
 SIGNIFICANT USER-VISIBLE CHANGES:
@@ -222,11 +222,11 @@
 
 EXTDATA UPDATES:
 
-- In extdata/refseq, scripts and data files updated for Reference 
-  Sequence (RefSeq) release 49 (2011-09-07).
+- In extdata/refseq, scripts and data files were updated for NCBI
+  Reference Sequence (RefSeq) release 55 (2012-09-17).
 
 - In extdata/bison, sample BLAST output files for Bison Pool metagenome
-  use target database generated from RefSeq release 49.
+  use target database generated from RefSeq release 55.
 
 - Add P(ressure) column to extdata/cpetc/SOJSH.csv and a stopifnot() 
   test for similarity to the experimental data to the example in 

Modified: pkg/CHNOSZ/inst/TODO
===================================================================
--- pkg/CHNOSZ/inst/TODO	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/TODO	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,3 +1,14 @@
+*********
+for 1.0.0
+*********
+
+- revisit the documentation, make it consistent with code changes!
+
+- check docs and function: does affinity return A/RT or A/2.303RT?
+
+- check examples of protein buffer calculations
+
+
 ********
 features
 ********
@@ -4,10 +15,6 @@
 
 - let diagram() map NA's - as in kyanite/sillimanite/andalusite example
 
-- implement alpha=TRUE for 2D diagram()
-
-- check docs and function: does affinity return A/RT or A/2.303RT?
-
 - make is.near.equil() a function of ep (element.potentials), not w (wjd)?
 
 

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq49.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq55.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq49.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq55.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq49.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq55.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq49.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq55.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq49.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq55.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Modified: pkg/CHNOSZ/inst/extdata/bison/gi.taxid.txt.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/inst/extdata/refseq/README.txt
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/README.txt	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/extdata/refseq/README.txt	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,40 +1,40 @@
 # the following data files support calculations using the 
-# RefSeq database (release 49, 2011-09-07)
+# RefSeq database (release 55, 2012-09-17)
 protein_refseq.csv: overall (average) amino acid composition of all proteins for each
-  microbial genome in the RefSeq collection (n=3471)
-taxid_names.csv: taxid, phylum name and species name for 3471 microbial taxa
+  microbial genome in the RefSeq collection (n=4567)
+taxid_names.csv: taxid, phylum name and species name for 4567 microbial taxa
 
 # these functions/scripts have the following purpose (output files listed in parentheses):
-mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq49.faa)
+mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq55.faa)
 gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt)
 protein.refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
 taxid.names.R - get taxonomic names for each taxid represented (taxid_names.csv)
 
 # bash scripts assume a GNU/Linux-like operating system
-# timings were made for processing RefSeq 49 on a recent (2009) intel laptop
+# timings were made for processing RefSeq 55 on a recent (2009) intel laptop
 
 # get the list of files and entries in the database
-1. download 'release49.files.installed' and 'RefSeq-release49.catalog.gz' from NCBI
+1. download 'release55.files.installed' and 'RefSeq-release55.catalog.gz' from NCBI
    (ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog)
-2. gzip -d RefSeq-release49.catalog.gz [1.7 GB]
+2. gzip -d RefSeq-release55.catalog.gz [1.7 GB]
 
 # download stuff
 3. list URLS for the microbial protein sequence files:
-     grep microbial.*.protein.faa* release49.files.installed | \
+     grep microbial.*.protein.faa* release55.files.installed | \
        sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist
-4. download the files using 'wget -i urllist' [1340 files, 2.0 GB]
+4. download the files using 'wget -i urllist' [1821 files, 2.8 GB]
 5. move the .gz files to a directory named 'protein'
 6. run ls protein/*.gz > filelist
-7. use 'mkfaa.sh' to combine the sequences into a single file 'refseq49.faa' [4.0 GB, ~3 minutes]
+7. use 'mkfaa.sh' to combine the sequences into a single file 'refseq55.faa' [5.5 GB, ~4 minutes]
 
 # protein stuff
-8. use 'gencat.sh' to generate gi.taxid.txt from RefSeq-release49.catalog
+8. use 'gencat.sh' to generate gi.taxid.txt from RefSeq-release55.catalog [3 minutes]
    note that the intermediate file gi.taxid.unsrt may have to be edited manually 
      -- see instructions in gencat.sh
    when done, the output of 'cat gi.taxid.txt | wc -l'  
-   should be equal to 'grep "^>" refseq49.faa | wc -l'
-   (for microbial proteins in RefSeq 49, the number is 10368471)
-9. generate protein_refseq.csv in R:  [~5.5 hours]
+   should be equal to 'grep "^>" refseq55.faa | wc -l'
+   (for microbial proteins in RefSeq 55, the number is 14162697)
+9. generate protein_refseq.csv in R:  [~8.9 hours]
    > source("protein.refseq.R")
    > protein.refseq()
    note that this depends on gi.taxid.txt and the .faa.gz files in the 'protein' directory
@@ -42,5 +42,5 @@
 # taxonomy stuff
 10. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files
     'names.dmp' and 'nodes.dmp' are present. these files can be downloaded from
-    ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2011-09-06)
-11. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~1.5 hours]
+    ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2012-09-19)
+11. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~2.5 hours]

Modified: pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/gencat.sh	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/extdata/refseq/gencat.sh	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,6 +1,6 @@
 #/bin/sh
 # extract microbial, genomic records from the RefSeq catalog
-RELEASE=49
+RELEASE=55
 ORG=microbial
 MOL=protein
 BASENAME=RefSeq-release$RELEASE.catalog 
@@ -24,7 +24,7 @@
 # for some reason the first line in gi.taxid.unsrt needs to be corrected manually
 # (found using both RefSeq 45 and 47)
 # str. 316407 W3110 --> 89106885 316407 21
-# (using RefSeq49)
+# (using RefSeq 49 and 55)
 # NP_047184.1 9 PROVISIONAL --> 10954455 9 280
 
 # sort the file on gi so that it can be used with e.g. the unix 'join' command

Modified: pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh	2012-10-21 13:10:12 UTC (rev 27)
@@ -1,6 +1,6 @@
 # send the contents of all the .faa.gz files to a single file ("OUTFILE")
 
-OUTFILE="refseq49.faa"
+OUTFILE="refseq55.faa"
 FILELIST="filelist"
 
 # start with an empty file

Modified: pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R	2012-10-21 13:10:12 UTC (rev 27)
@@ -3,7 +3,7 @@
 # for each of the microbial taxa in RefSeq database
 
 # change this to the location where names.dmp and nodes.dmp are located
-taxdir <- "/home/download/sequences/taxonomy/refseq49"
+taxdir <- "/home/download/sequences/taxonomy/refseq55"
 
 # get the taxids from protein_refseq.csv
 pr <- read.csv("protein_refseq.csv.xz")

Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/man/extdata.Rd	2012-10-21 13:10:12 UTC (rev 27)
@@ -20,7 +20,7 @@
 
   Files in \code{bison} contain BLAST results and taxonomic information for a metagenome:
   \itemize{
-    \item \code{bisonN_vs_refseq49.blast.xz}, \code{bisonS_vs_refseq49.blast.xz}, \code{bisonR_vs_refseq49.blast.xz}, \code{bisonQ_vs_refseq49.blast.xz}, \code{bisonP_vs_refseq49.blast.xz} are partial tabular BLAST results for proteins in the Bison Pool Environmental Genome. Protein sequences predicted in the metagenome were downloaded from the Joint Genome Institute's IMG/M system on 2009-05-13. The target database for the searches was constructed from microbial protein sequences in National Center for Biotechnology Information (NCBI) RefSeq database version 49, representing 3471 microbial genomes. The \sQuote{blastall} command was used with the default setting for E value cuttoff (10.0) and options to make a tabular output file consisting of the top 20 hits for each query sequence. The function \code{\link{read.blast}} was used to extract only those hits with E values less than or equal to 1e-5 and with sequence similarity (percent identity) at least 30 percent, and to keep only the first hit for each query sequence. The function \code{\link{write.blast}} was used to save partial BLAST files (only selected columns). The files provided with CHNOSZ contain the first 5,000 hits for each sampling site at Bison Pool, representing between about 7 to 15 percent of the first BLAST hits after similarity and E value filtering.
+    \item \code{bisonN_vs_refseq55.blastp.xz}, \code{bisonS_vs_refseq55.blastp.xz}, \code{bisonR_vs_refseq55.blastp.xz}, \code{bisonQ_vs_refseq55.blastp.xz}, \code{bisonP_vs_refseq55.blastp.xz} are partial tabular BLAST results for proteins in the Bison Pool Environmental Genome. Protein sequences predicted in the metagenome were downloaded from the Joint Genome Institute's IMG/M system on 2009-05-13. The target database for the searches was constructed from microbial protein sequences in National Center for Biotechnology Information (NCBI) RefSeq database version 55, representing 4567 microbial genomes. The \sQuote{blastall} command was used with the default setting for E value cutoff (10.0) and options to make a tabular output file consisting of the top 20 hits for each query sequence. The function \code{\link{read.blast}} was used to extract only those hits with E values less than or equal to 1e-5 and with sequence similarity (percent identity) at least 30 percent, and to keep only the first hit for each query sequence. The function \code{\link{write.blast}} was used to save partial BLAST files (only selected columns). The files provided with CHNOSZ contain the first 5,000 hits for each sampling site at Bison Pool, representing between about 7 and 15 percent of the first BLAST hits after similarity and E value filtering.
     \item \code{gi.taxid.txt.xz} is a table that lists the sequence identifiers (gi numbers) that appear in the example BLAST files (see above), together with the corresponding taxon ids used in the NCBI databases. This file is \emph{not} a subset of the complete \sQuote{gi_taxid_prot.dmp.gz} available at \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/} but instead is a subset of \sQuote{gi.taxid.txt} generated from the RefSeq release catalog using \sQuote{gencat.sh} in the \code{refseq} directory. See \code{\link{id.blast}} for an example that uses this file and the BLAST files described above.
   }
 
@@ -50,7 +50,7 @@
   }
 
 
-  Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, updated for RefSeq release 49 of 2011-09-07:
+  Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, updated for RefSeq release 55 of 2012-09-17:
   \itemize{
     \item \code{README.txt} Instructions for producing the data files.
     \item \code{gencat.sh} Bash script to extract microbial protein records from the RefSeq catalog.

Modified: pkg/CHNOSZ/man/objective.Rd
===================================================================
--- pkg/CHNOSZ/man/objective.Rd	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/man/objective.Rd	2012-10-21 13:10:12 UTC (rev 27)
@@ -11,6 +11,7 @@
 \alias{RMSD}
 \alias{CVRMSD}
 \alias{DDGmix}
+\alias{DGinf}
 \alias{DGtr}
 \alias{get.objfun}
 \title{Objective Functions}
@@ -31,6 +32,7 @@
   RMSD(loga1, loga2)
   CVRMSD(loga1, loga2)
   DDGmix(loga1, loga2)
+  DGinf(a1, a2)
   DGtr(loga1, loga2, Astar)
   get.objfun(objective)
 }
@@ -39,6 +41,7 @@
   \item{a1}{numeric matrix, chemical activities of species}
   \item{loga1}{numeric matrix, logarithms of activity}
   \item{loga2}{numeric, reference values of logarithms of activity}
+  \item{a2}{numeric, reference values of activity}
   \item{Astar}{numeric, reference values of chemical affinity}
   \item{objective}{character, name of objective function}
 }
@@ -46,7 +49,7 @@
 \details{
 
 The value in \code{a1} or \code{loga1} is a matrix of chemical activities or logarithms of activity with a column for each species, and a row for each chemical condition.
-Except for calculations of the Shannon entropy, all logarithmic bases are decimal.
+Except for calculations of the Shannon entropy, all logarithmic bases (including in the equations below) are decimal.
 
 \code{SD}, \code{CV} and \code{shannon} calculate the standard deviation, coefficient of variation, and Shannon entropy for the values in each row of \code{a1}. The Shannon entropy is calculated from the fractional abundances: H = sum(-p * log(p)) (natural logarithm), where p=a1/sum(a1).
 
@@ -61,6 +64,11 @@
 
 \code{DDGmix} calculates the difference in Gibbs energy/2.303RT of ideal mixing between the assemblages with logarithms of activity \code{loga1} and \code{loga2}.
 
+\code{DGinf} calculates the difference in Gibbs energy/2.303RT attributed to relative informatic entropy between an initial assemblage with activities \code{a2} and final assemblage(s) with activities in each row of \code{a1}.
+The equation used is DGinf/2.303RT = sum(p2 * (logp2 - logp1)), where p1 and p2 are the proportions, i.e. p1 = a1 / sum(a1) and p2 = a2 / sum(a2). 
+This equation has the form of the Kullback-Leibler divergence, sometimes known as relative entropy (Ludovisi and Taticchi, 2006).
+In specific cases (systems where formulas of species are normalized by the balancing coefficients), the values of \code{DGinf} and \code{DGtr} are equal.
+
 \code{DGtr} calculates the change in Gibbs energy/2.303RT of a system in which species with initial logarithms of activity (\code{loga1}) are transformed to the same species with different final logarithms of activity (\code{loga2}) at constant temperature, pressure and chemical potentials of basis species.
 It is calculated as the sum over species of (G2-G1) where G1/RT = -a1*Astar + a1*loga1 - a1 + a constant (where a1 is 10^loga1), likewise for G2, and where \code{Astar} is the starved affinity, that is the affinity of the reaction to form one mole of the species at unit activity from the basis species in their defined activities.
 The equation used arises from integrating dG = -A*dxi = -A*dn where xi is the reaction progress variable, dn/dxi = 1 is the reaction coefficient on the species, and A = Astar - 2.303RTloga is the chemical affinity. 
@@ -99,7 +107,7 @@
 # take a reference equilibrium distribution at logfH2 = -7.5
 loga1 <- list2array(e$loga.equil)[51, ]
 Astar <- list2array(e$Astar)[51, ]
-# equilibrium at any other logfH2 is not equilibrium at logfH2 = -75
+# equilibrium at any other logfH2 is not equilibrium at logfH2 = -7.5
 DGtr.out <- DDGmix.out <- numeric()
 for(i in 1:length(a$vals[[1]])) {
   loga2 <- list2array(e$loga.equil)[i, ]
@@ -130,6 +138,9 @@
 
   Anderson, G. M. (2005) \emph{Thermodynamics of Natural Systems}, 2nd ed., Cambridge University Press, 648 p. \url{http://www.worldcat.org/oclc/474880901}
 
+  Ludovisi, A. and Taticchi, M. I. (2006) Investigating beta diversity by Kullback-Leibler information measures. \emph{Ecological Modelling} \bold{192}, 299--313. \url{http://dx.doi.org/10.1016/j.ecolmodel.2005.05.022}
+
+
 }
 
 \keyword{secondary}

Modified: pkg/CHNOSZ/man/protein.info.Rd
===================================================================
--- pkg/CHNOSZ/man/protein.info.Rd	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/man/protein.info.Rd	2012-10-21 13:10:12 UTC (rev 27)
@@ -82,8 +82,8 @@
 
 ## using protein.formula: average oxidation state of 
 ## carbon of proteins from different organisms
-# get amino acid compositions of proteins 
-# generated from the RefSeq database
+# get amino acid compositions of microbial proteins 
+# generated from the RefSeq database 
 file <- system.file("extdata/refseq/protein_refseq.csv.xz", package="CHNOSZ")
 ip <- add.protein(read.aa(file))
 # only use those organisms with a certain
@@ -93,9 +93,9 @@
 zc <- ZC(pf)
 # the organism names we search for
 # "" matches all organisms
-terms <- c("Streptomyces", "Pseudomonas", "Salmonella",
-  "Escherichia", "Vibrio", "Bacteroides", "Lactobacillus",
-  "Staphylococcus", "Streptococcus", "Methano", "Bacillus", "Thermo","")
+terms <- c("Halo", "Streptomyces", "Pseudomonas", "Salmonella",
+  "Escherichia", "Bacteroides", "Lactobacillus", "Staphylococcus",
+  "Streptococcus", "Methano", "Bacillus", "Thermo", "")
 tps <- thermo$protein$ref[ip]
 plot(0, 0, xlim=c(1, 13), ylim=c(-0.3, -0.05), pch="",
   ylab="average oxidation state of carbon in proteins",
@@ -105,7 +105,7 @@
   zct <- zc[it]
   points(jitter(rep(i, length(zct))), zct, pch=20)
 }
-terms[13] <- paste("all organisms")
+terms[13] <- paste("all microbial")
 axis(1, 1:13, terms, las=2)
 title(main=paste("Average Oxidation State of Carbon:",
   "Total Protein per taxID in NCBI RefSeq", sep="\n"))

Modified: pkg/CHNOSZ/man/util.blast.Rd
===================================================================
--- pkg/CHNOSZ/man/util.blast.Rd	2012-10-13 06:08:14 UTC (rev 26)
+++ pkg/CHNOSZ/man/util.blast.Rd	2012-10-21 13:10:12 UTC (rev 27)
@@ -76,7 +76,7 @@
   taxid.names <- read.csv(nfile)
   # the BLAST files
   sites <- c("N","S","R","Q","P")
-  bfile <- paste("extdata/bison/bison", sites, "_vs_refseq49.blastp.xz", sep="")
+  bfile <- paste("extdata/bison/bison", sites, "_vs_refseq55.blastp.xz", sep="")
   for(i in 1:5) {
     file <- system.file(bfile[i], package="CHNOSZ")
     # read the blast file, with default filtering settings