[CHNOSZ-commits] r49 - in pkg/CHNOSZ: . R inst inst/doc inst/extdata/bison inst/extdata/refseq man

Sun Mar 24 08:04:53 CET 2013

Author: jedick
Date: 2013-03-24 08:04:53 +0100 (Sun, 24 Mar 2013)
New Revision: 49

Added:
   pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq57.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq57.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq57.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq57.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq57.blastp.xz
   pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
Removed:
   pkg/CHNOSZ/inst/doc/anintro.pdf
   pkg/CHNOSZ/inst/doc/equilibrium.pdf
   pkg/CHNOSZ/inst/doc/hotspring.pdf
   pkg/CHNOSZ/inst/doc/wjd.pdf
   pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq55.blastp.xz
   pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq55.blastp.xz
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/diagram.R
   pkg/CHNOSZ/R/revisit.R
   pkg/CHNOSZ/R/util.data.R
   pkg/CHNOSZ/inst/CHECKLIST
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/inst/extdata/bison/gi.taxid.txt.xz
   pkg/CHNOSZ/inst/extdata/refseq/README.txt
   pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
   pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
   pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
   pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
   pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
   pkg/CHNOSZ/man/extdata.Rd
   pkg/CHNOSZ/man/protein.info.Rd
   pkg/CHNOSZ/man/sideeffects.Rd
   pkg/CHNOSZ/man/util.blast.Rd
Log:
update extdata/refseq for RefSeq release 57


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/DESCRIPTION	2013-03-24 07:04:53 UTC (rev 49)
@@ -1,11 +1,11 @@
-Date: 2013-03-15
+Date: 2013-03-24
 Package: CHNOSZ
-Version: 0.9-9.9
+Version: 0.9-9.10
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey M. Dick
-Maintainer: Jeffrey M. Dick <jmdick at asu.edu>
+Maintainer: Jeffrey M. Dick <j3ffdick at gmail.com>
 Depends: R (>= 2.12.0), utils
-Suggests: testthat, parallel, limSolve
+Suggests: limSolve, parallel, testthat
 Description: This package includes functions and data sets to support chemical thermodynamic 
   modeling in biochemistry and low-temperature geochemistry. The features include calculation
   of the standard molal thermodynamic properties and chemical affinities of reactions involving 

Modified: pkg/CHNOSZ/R/diagram.R
===================================================================
--- pkg/CHNOSZ/R/diagram.R	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/R/diagram.R	2013-03-24 07:04:53 UTC (rev 49)
@@ -288,7 +288,7 @@
           iy <- which(y1==y2)
           ys[iy*2] <- NA
           # no line segment at a dotted position
-          iyd <- which(ys%%dotted==0)
+          iyd <- rowSums(sapply(dotted, function(y) ys%%y==0)) > 0
           ys[iyd] <- NA
           return(list(xs=xs, ys=ys))
         }
@@ -302,17 +302,17 @@
           ix <- which(x1==x2)
           xs[ix*2] <- NA
           # no line segment at a dotted position
-          ixd <- which(xs%%dotted==0)
+          ixd <- rowSums(sapply(dotted, function(x) xs%%x==0)) > 0
           xs[ixd] <- NA
           return(list(xs=xs, ys=ys))
         }
         clipfun <- function(z, zlim) {
           if(zlim[2] > zlim[1]) {
-            z[z>zlim[2]] <- zlim[2]
-            z[z<zlim[1]] <- zlim[1]
+            z[z>zlim[2]] <- NA
+            z[z<zlim[1]] <- NA
           } else {
-            z[z>zlim[1]] <- zlim[1]
-            z[z<zlim[2]] <- zlim[2]
+            z[z>zlim[1]] <- NA
+            z[z<zlim[2]] <- NA
           }
           return(z)
         }
@@ -328,6 +328,7 @@
         xs <- xlim[1] + (xs - 0.5) * rx
         ys <- ylim[1] + (ys - 0.5) * ry
         ys <- clipfun(ys, ylim)
+        if(!is.null(xrange)) xs <- clipfun(xs, xrange)
         lines(xs, ys, col=col, lwd=lwd)
         # horizontal lines
         xs <- ys <-NA
@@ -340,7 +341,7 @@
         ys <- ylim[2] - (ys - 0.5) * ry
         xs <- clipfun(xs, xlim)
         if(!is.null(xrange)) xs <- clipfun(xs, xrange)
-        lines(xs,ys, col=col, lwd=lwd)
+        lines(xs, ys, col=col, lwd=lwd)
       }
       ## label plot function
       # calculate coordinates for field labels

Modified: pkg/CHNOSZ/R/revisit.R
===================================================================
--- pkg/CHNOSZ/R/revisit.R	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/R/revisit.R	2013-03-24 07:04:53 UTC (rev 49)
@@ -41,6 +41,9 @@
     for(i in 1:xres) y <- c(y, which.max(z[i,]))
     for(i in 1:yres) x <- c(x, which.max(z[,i]))
   }
+  # stop if we missed some
+  if(length(x)!=xres) stop("optima not found for all y")
+  if(length(y)!=yres) stop("optima not found for all x")
   return(list(x=x, y=y))
 }
 

Modified: pkg/CHNOSZ/R/util.data.R
===================================================================
--- pkg/CHNOSZ/R/util.data.R	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/R/util.data.R	2013-03-24 07:04:53 UTC (rev 49)
@@ -88,7 +88,8 @@
       if(isTRUE(all.equal(oldprop, args[iold[i], ], check.attributes=FALSE))) 
         msgout("mod.obigt: no change for ", args$name[iold[i]], "(", state, ")\n")
       else {
-        thermo$obigt[ispecies[iold[i]], icol] <<- args[iold[i], ]
+        thermo$obigt[ispecies[iold[i]], icol] <- args[iold[i], ]
+        assign("thermo", thermo, "CHNOSZ")
         msgout("mod.obigt: updated ", args$name[iold[i]], "(", state, ")\n")
       }
     }

Modified: pkg/CHNOSZ/inst/CHECKLIST
===================================================================
--- pkg/CHNOSZ/inst/CHECKLIST	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/CHECKLIST	2013-03-24 07:04:53 UTC (rev 49)
@@ -2,9 +2,12 @@
 release checklist for CHNOSZ
 ****************************
 
+- remove all "<<-" from sources (leftover from debugging)
+
 - update list of documentation topics in examples() with any new ones
 
-- run examples() to make sure that all examples can be run (includes \donttest ones)
+- run examples() to make sure that all examples can be run
+  (that includes \donttest ones that aren't run by R CMD check)
 
 - run demos() to run all demos
 
@@ -28,9 +31,12 @@
 
 - ensure all Rd files have \keyword{...}
 
-- test read.supcrt(), write.supcrt() on e.g. slop98.dat
-
 - update .Rinstignore with any new/deleted files
 
 - run R CMD check using R compiled without long doubles (as in Solaris checks on CRAN)
   (CFLAGS=-ffloat-store ./configure --disable-long-double)
+
+- build the package on the source directory:
+  R CMD build --compact-vignettes chnosz
+  (don't use --no-vignettes, that keeps the Rnw files from being copied to inst/doc,
+   causing vignettes to disappear from CRAN webpage)

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/NEWS	2013-03-24 07:04:53 UTC (rev 49)
@@ -1,5 +1,5 @@
-CHANGES IN CHNOSZ 0.9-9.9 (2013-03-15)
---------------------------------------
+CHANGES IN CHNOSZ 0.9-9.10 (2013-03-24)
+---------------------------------------
 
 MAJOR USER-VISIBLE CHANGE:
 

Deleted: pkg/CHNOSZ/inst/doc/anintro.pdf
===================================================================
(Binary files differ)

Deleted: pkg/CHNOSZ/inst/doc/equilibrium.pdf
===================================================================
(Binary files differ)

Deleted: pkg/CHNOSZ/inst/doc/hotspring.pdf
===================================================================
(Binary files differ)

Deleted: pkg/CHNOSZ/inst/doc/wjd.pdf
===================================================================
(Binary files differ)

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq57.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonN_vs_refseq57.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq57.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonP_vs_refseq57.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq57.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonQ_vs_refseq57.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq57.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonR_vs_refseq57.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Deleted: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq55.blastp.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq57.blastp.xz
===================================================================
(Binary files differ)


Property changes on: pkg/CHNOSZ/inst/extdata/bison/bisonS_vs_refseq57.blastp.xz
___________________________________________________________________
Added: svn:mime-type
   + application/x-xz

Modified: pkg/CHNOSZ/inst/extdata/bison/gi.taxid.txt.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/inst/extdata/refseq/README.txt
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/README.txt	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/extdata/refseq/README.txt	2013-03-24 07:04:53 UTC (rev 49)
@@ -1,46 +1,50 @@
 # the following data files support calculations using the 
-# RefSeq database (release 55, 2012-09-17)
+# RefSeq database (release 57, 2013-01-08)
 protein_refseq.csv: overall (average) amino acid composition of all proteins for each
-  microbial genome in the RefSeq collection (n=4567)
-taxid_names.csv: taxid, phylum name and species name for 4567 microbial taxa
+  microbial genome in the RefSeq collection (n=7415)
+taxid_names.csv: taxid, phylum name and species name for 7415 microbial taxa
 
 # these functions/scripts have the following purpose (output files listed in parentheses):
-mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq55.faa)
+mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq57.faa)
 gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt)
 protein.refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
 taxid.names.R - get taxonomic names for each taxid represented (taxid_names.csv)
 
 # bash scripts assume a GNU/Linux-like operating system
-# timings were made for processing RefSeq 55 on a recent (2009) intel laptop
+# timings were made for processing RefSeq 57 on a recent (2009) intel laptop
 
-# get the list of files and entries in the database
-1. download 'release55.files.installed' and 'RefSeq-release55.catalog.gz' from NCBI
+# download stuff
+1. download 'release57.files.installed' and 'RefSeq-release57.catalog.gz' from NCBI
    (ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog)
-2. gzip -d RefSeq-release55.catalog.gz [1.7 GB]
-
-# download stuff
-3. list URLS for the microbial protein sequence files:
-     grep microbial.*.protein.faa* release55.files.installed | \
+2. list URLS for the microbial protein sequence files:
+     grep microbial.*.protein.faa* release57.files.installed | \
        sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist
-4. download the files using 'wget -i urllist' [1821 files, 2.8 GB]
-5. move the .gz files to a directory named 'protein'
-6. run ls protein/*.gz > filelist
-7. use 'mkfaa.sh' to combine the sequences into a single file 'refseq55.faa' [5.5 GB, ~4 minutes]
+3. download the files using 'wget -i urllist' [3227 files, 4.6 GB]
+4. move the .gz files to a directory named 'protein'
 
 # protein stuff
-8. use 'gencat.sh' to generate gi.taxid.txt from RefSeq-release55.catalog [3 minutes]
-   note that the intermediate file gi.taxid.unsrt may have to be edited manually 
-     -- see instructions in gencat.sh
-   when done, the output of 'cat gi.taxid.txt | wc -l'  
-   should be equal to 'grep "^>" refseq55.faa | wc -l'
-   (for microbial proteins in RefSeq 55, the number is 14162697)
-9. generate protein_refseq.csv in R:  [~8.9 hours]
+5. gzip -d RefSeq-release57.catalog.gz [3.1 GB]
+6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [7 minutes]
+   for RefSeq57, 'cat gi.taxid.txt | wc -l' is 24488527
+7. generate protein_refseq.csv in R:  [~19 hours]
    > source("protein.refseq.R")
    > protein.refseq()
    note that this depends on gi.taxid.txt and the .faa.gz files in the 'protein' directory
+8. trim entries from protein_refseq.csv (smaller size, better for package distribution)
+   > source("trim_refseq.R")
 
 # taxonomy stuff
-10. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files
+9. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files
     'names.dmp' and 'nodes.dmp' are present. these files can be downloaded from
-    ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2012-09-19)
-11. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~2.5 hours]
+    ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-01-15)
+10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~4.5 hours]
+
+# BLAST stuff (optional)
+11. run ls protein/*.gz > filelist
+12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq57.faa' [9.3 GB, 11 minutes]
+    for RefSeq57, 'grep "^>" refseq57.faa | wc -l' is 24477649
+    the difference from the catalog (step 6 above), is 10878 sequences:
+    taxid 1211777 (Rhizobium mesoamericanum STM3625) (6356 sequences) 
+    taxid 313627 (Bacillus sp. NRRL B-14911) (4522 sequences)
+13. make a BLAST database, e.g. formatdb -t refseq57 -i refseq57.faa -p T
+

Modified: pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/gencat.sh	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/extdata/refseq/gencat.sh	2013-03-24 07:04:53 UTC (rev 49)
@@ -1,6 +1,6 @@
 #/bin/sh
 # extract microbial, genomic records from the RefSeq catalog
-RELEASE=55
+RELEASE=57
 ORG=microbial
 MOL=protein
 BASENAME=RefSeq-release$RELEASE.catalog 
@@ -12,20 +12,16 @@
 # alternatively, could use egrep:
 #egrep "[[:space:]]AP_ | [[:space:]]NP_ | [[:space:]]XP_ | \
 #  [[:space:]]YP_ | [[:space:]]ZP_" $BASENAME.$ORG  > $BASENAME.$ORG.$MOL
-grep "[[:space:]]AP_" $BASENAME.$ORG  > $BASENAME.$ORG.$MOL
-grep "[[:space:]]NP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL
-grep "[[:space:]]XP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL
-grep "[[:space:]]YP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL
-grep "[[:space:]]ZP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL
+grep "[[:space:]]AP_" $BASENAME.$ORG  > $BASENAME.$ORG.$MOL  # 0 records
+grep "[[:space:]]NP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL  # 450218 records
+grep "[[:space:]]XP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL  # 0 records
+grep "[[:space:]]YP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL  # 6786156 records
+grep "[[:space:]]ZP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL  # 17252153 records
 
-# to save only the gi, taxid and sequence length columns
-cat $BASENAME.$ORG.$MOL | awk '{FS="\t"} {print $4,$1,$7}' > gi.taxid.unsrt
+# to save only the gi, taxid and sequence length columns, in that order
+# the field separator (tab) is defined in command line, not in awk program,
+#   otherwise the first line gets processed incorrectly
+cat $BASENAME.$ORG.$MOL | awk -F\\t '{print $4,$1,$7}' > gi.taxid.unsrt
 
-# for some reason the first line in gi.taxid.unsrt needs to be corrected manually
-# (found using both RefSeq 45 and 47)
-# str. 316407 W3110 --> 89106885 316407 21
-# (using RefSeq 49 and 55)
-# NP_047184.1 9 PROVISIONAL --> 10954455 9 280
-
 # sort the file on gi so that it can be used with e.g. the unix 'join' command
 cat gi.taxid.unsrt | sort > gi.taxid.txt

Modified: pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh	2013-03-24 07:04:53 UTC (rev 49)
@@ -1,6 +1,6 @@
 # send the contents of all the .faa.gz files to a single file ("OUTFILE")
 
-OUTFILE="refseq55.faa"
+OUTFILE="refseq57.faa"
 FILELIST="filelist"
 
 # start with an empty file

Modified: pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R	2013-03-24 07:04:53 UTC (rev 49)
@@ -3,7 +3,7 @@
 # for each of the microbial taxa in RefSeq database
 
 # change this to the location where names.dmp and nodes.dmp are located
-taxdir <- "/home/download/sequences/taxonomy/refseq55"
+taxdir <- "./taxdump"
 
 # get the taxids from protein_refseq.csv
 pr <- read.csv("protein_refseq.csv.xz")

Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
===================================================================
(Binary files differ)

Added: pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R	                        (rev 0)
+++ pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R	2013-03-24 07:04:53 UTC (rev 49)
@@ -0,0 +1,66 @@
+# trim protein_refseq.csv in place, keeping only the first entry for each of
+# the highly-represented names below (to keep the file size down for CHNOSZ package)
+# 20130320 jmd
+
+# the original file (7415 rows)
+pr <- read.csv("protein_refseq.csv")
+# the common names (comments show number in RefSeq 57)
+names <- c(
+  "Escherichia coli",  # 662
+  "Streptococcus",     # 432
+  "Salmonella",        # 299
+  "Staphylococcus",    # 290
+  "Enterococcus",      # 218
+  "Vibrio",            # 190
+  "Lactobacillus",     # 179
+  "Helicobacter",      # 164
+  "Pseudomonas",       # 155
+  "Mycobacterium",     # 138
+  "Campylobacter",     # 131
+  "Neisseria",         # 121
+  "Clostridium",       # 118
+  "Yersinia",          # 111
+  "Bacillus cereus",   # 105
+  "Acinetobacter",     # 103
+  "Propionibacterium", # 93
+  "Burkholderia",      # 91
+  "Candidatus",        # 83
+  "Bacteroides",       # 80
+  "Mycoplasma",        # 73
+  "Streptomyces",      # 72
+  "Corynebacterium",   # 70
+  "Listeria",          # 60
+  "Leptospira",        # 57
+  "Klebsiella",        # 57
+  "Bifidobacterium",   # 55
+  "Brucella",          # 53
+  "Shigella",          # 50
+  "Haemophilus",       # 47
+  "Rickettsia",        # 46
+  "Prevotella",        # 44
+  "Chlamydia",         # 42
+  "Francisella",       # 37
+  "Bacillus thuringiensis", # 36
+  "Borrelia",          # 34
+  "Fusobacterium",     # 33
+  "Xanthomonas",       # 31
+  "Rhizobium",         # 27
+  "Bartonella",        # 26
+  "Pseudoalteromonas", # 24
+  "Bacillus anthracis", # 23
+  "Actinomyces",       # 23
+  "Treponema",         # 21
+  "Actinobacillus",    # 20
+  "Aggregatibacter",   # 20
+  "Gardnerella"        # 19
+)
+
+# for each name, keep the first matching row and drop the rest
+for(name in names) {
+  iname <- tail(grep(name, pr$ref), -1)
+  # skip when nothing to drop: pr[-integer(0), ] would return zero rows
+  if(length(iname) > 0) pr <- pr[-iname, ]
+}
+
+# for RefSeq 57 we're left with 2600 rows; the input file is overwritten
+write.csv(pr, "protein_refseq.csv", row.names=FALSE)

Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/man/extdata.Rd	2013-03-24 07:04:53 UTC (rev 49)
@@ -21,7 +21,7 @@
 
   Files in \code{bison} contain BLAST results and taxonomic information for a metagenome:
   \itemize{
-    \item \code{bisonN_vs_refseq55.blast.xz}, \code{bisonS_vs_refseq55.blast.xz}, \code{bisonR_vs_refseq55.blast.xz}, \code{bisonQ_vs_refseq55.blast.xz}, \code{bisonP_vs_refseq55.blast.xz} are partial tabular BLAST results for proteins in the Bison Pool Environmental Genome. Protein sequences predicted in the metagenome were downloaded from the Joint Genome Institute's IMG/M system on 2009-05-13. The target database for the searches was constructed from microbial protein sequences in National Center for Biotechnology Information (NCBI) RefSeq database version 55, representing 4567 microbial genomes. The \sQuote{blastall} command was used with the default setting for E value cuttoff (10.0) and options to make a tabular output file consisting of the top 20 hits for each query sequence. The function \code{\link{read.blast}} was used to extract only those hits with E values less than or equal to 1e-5 and with sequence similarity (percent identity) at least 30 percent, and to keep only the first hit for each query sequence. The function \code{\link{write.blast}} was used to save partial BLAST files (only selected columns). The files provided with CHNOSZ contain the first 5,000 hits for each sampling site at Bison Pool, representing between about 7 to 15 percent of the first BLAST hits after similarity and E value filtering.
+    \item \code{bisonN_vs_refseq57.blastp.xz}, \code{bisonS_vs_refseq57.blastp.xz}, \code{bisonR_vs_refseq57.blastp.xz}, \code{bisonQ_vs_refseq57.blastp.xz}, \code{bisonP_vs_refseq57.blastp.xz} are partial tabular BLAST results for proteins in the Bison Pool Environmental Genome. Protein sequences predicted in the metagenome were downloaded from the Joint Genome Institute's IMG/M system on 2009-05-13. The target database for the searches was constructed from microbial protein sequences in National Center for Biotechnology Information (NCBI) RefSeq database version 57, representing 7415 microbial genomes. The \sQuote{blastall} command was used with the default setting for E value cutoff (10.0) and options to make a tabular output file consisting of the top 20 hits for each query sequence. The function \code{\link{read.blast}} was used to extract only those hits with E values less than or equal to 1e-5 and with sequence similarity (percent identity) at least 30 percent, and to keep only the first hit for each query sequence. The function \code{\link{write.blast}} was used to save partial BLAST files (only selected columns). The files provided with CHNOSZ contain the first 5,000 hits for each sampling site at Bison Pool, representing between about 7 to 15 percent of the first BLAST hits after similarity and E value filtering.
     \item \code{gi.taxid.txt.xz} is a table that lists the sequence identifiers (gi numbers) that appear in the example BLAST files (see above), together with the corresponding taxon ids used in the NCBI databases. This file is \emph{not} a subset of the complete \sQuote{gi_taxid_prot.dmp.gz} available at \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/} but instead is a subset of \sQuote{gi.taxid.txt} generated from the RefSeq release catalog using \sQuote{gencat.sh} in the \code{refseq} directory. See \code{\link{id.blast}} for an example that uses this file and the BLAST files described above.
   }
 
@@ -51,13 +51,14 @@
   }
 
 
-  Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, updated for RefSeq release 55 of 2012-09-17:
+  Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 57 of 2013-01-08:
   \itemize{
     \item \code{README.txt} Instructions for producing the data files.
     \item \code{gencat.sh} Bash script to extract microbial protein records from the RefSeq catalog.
     \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome.
     \item \code{mkfaa.sh} Combine the contents of .faa.gz files into a single FASTA file (to use e.g. for making a BLAST database).
     \item \code{protein.refseq.R} Calculate average amino acid composition of all proteins for each organism identified by a taxonomic ID.
+    \item \code{trim_refseq.R} Remove some entries with commonly occurring names (e.g. many different strains of Escherichia coli) to reduce size of \code{protein_refseq.csv} (to keep package size down).
     \item \code{protein_refseq.csv.xz} Output from above. See example for \code{\link{ZC}}.
     \item \code{taxid.names.R} Generate a table of scientific names for the provided taxids. Requires the complete \code{names.dmp} and \code{nodes.dmp} from NCBI taxonomy files.
     \item \code{taxid_names.csv.xz} Output from above. See example for \code{\link{id.blast}}.

Modified: pkg/CHNOSZ/man/protein.info.Rd
===================================================================
--- pkg/CHNOSZ/man/protein.info.Rd	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/man/protein.info.Rd	2013-03-24 07:04:53 UTC (rev 49)
@@ -133,11 +133,11 @@
 zc <- ZC(pf)
 # the organism names we search for
 # "" matches all organisms
-terms <- c("Halo", "Streptomyces", "Pseudomonas", "Salmonella",
-  "Escherichia", "Bacteroides", "Lactobacillus", "Staphylococcus",
-  "Streptococcus", "Methano", "Bacillus", "Thermo", "")
+terms <- c("Natr", "Halo", "Rhodo", "Acido", "Methylo",
+  "Nitro", "Desulfo", "Chloro", "Geo", "Methano",
+  "Thermo", "Pyro", "Sulfo", "Buchner", "")
 tps <- thermo$protein$ref[ip]
-plot(0, 0, xlim=c(1, 13), ylim=c(-0.3, -0.05), pch="",
+plot(0, 0, xlim=c(1, 15), ylim=c(-0.3, -0.05), pch="",
   ylab="average oxidation state of carbon in proteins",
   xlab="", xaxt="n", mar=c(6, 3, 1, 1))
 for(i in 1:length(terms)) {
@@ -145,8 +145,8 @@
   zct <- zc[it]
   points(jitter(rep(i, length(zct))), zct, pch=20)
 }
-terms[13] <- paste("all microbial")
-axis(1, 1:13, terms, las=2)
+terms[15] <- paste("all", length(ip))
+axis(1, 1:15, terms, las=2)
 title(main=paste("Average Oxidation State of Carbon:",
   "Total Protein per taxID in NCBI RefSeq", sep="\n"))
 }

Modified: pkg/CHNOSZ/man/sideeffects.Rd
===================================================================
--- pkg/CHNOSZ/man/sideeffects.Rd	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/man/sideeffects.Rd	2013-03-24 07:04:53 UTC (rev 49)
@@ -12,11 +12,18 @@
 
 The \samp{CHNOSZ} environment is \emph{not} (as of CHNOSZ 1.0.0) \code{\link{attach}ed}, rather the \code{thermo} object is accessed in functions using \code{\link{get}} (as in \code{get("thermo")}), \code{\link{assign}} (\code{assign("thermo", thermo, "CHNOSZ")}) and occasionally \code{\link{with}} (\code{with(as.environment("CHNOSZ"), ...)}).
 
-  In the functions in the package, the greatest number of accessions are to the thermodynamic database (\code{thermo$obigt}), followed by the basis and species definitions (\code{thermo$basis} and \code{thermo$species}). For example, \code{\link{info}} can be used to look up thermodynamic data in \code{thermo$obigt} by the name or chemical formula of a species. As another example, \code{\link{subcrt}} attempts to balance unbalanced chemical reactions with the user-defined basis species in \code{thermo$basis}.
+In the functions in the package, the greatest number of accessions are to the thermodynamic database (\code{thermo$obigt}), followed by the basis and species definitions (\code{thermo$basis} and \code{thermo$species}).
+For example, \code{\link{info}} can be used to look up thermodynamic data in \code{thermo$obigt} by the name or chemical formula of a species.
+As another example, \code{\link{subcrt}} attempts to balance unbalanced chemical reactions with the user-defined basis species in \code{thermo$basis}.
 
-  Some functions modify the thermodynamic database or system definition in \code{thermo}. These are examples of \dQuote{side effects}, since the functions have an effect on the state of the program that persists beyond the lifetime of the objects returned by the functions. In the code, side effects can be recognized by the presence of the \dQuote{superassignment} operator \code{\link{<<-}}. 
+Some functions modify the thermodynamic database or system definition in \code{thermo}.
+These are \dQuote{side effects}, since the functions have an effect on the state of the program that persists beyond the lifetime of the objects returned by the functions.
+In the code, side effects can be recognized by assignment to the \samp{thermo} object in the \samp{CHNOSZ} environment, i.e. \code{assign("thermo", thermo, "CHNOSZ")} (the unquoted \code{thermo} here refers to the object that was manipulated internally by a function and is now being assigned to the environment).
 
-  Side effects are not highly desirable in functional programming languages such as \R. The reason this design is adopted in CHNOSZ is that interactive use of \code{\link{basis}} and \code{\link{species}} appeared to the author, in the early stages of developing the package and of learning \R, to be facilitated by not requiring users to assign the results of these functions to objects. Instead, using side effects, the program \dQuote{remembers} the results of these function calls. Experience has shown that this design is usable (especially for new users), and is adaptable to many usage scenarios, but the dependence on side effects probably should be eliminated in the future.
+Side effects are not highly desirable in functional programming languages such as \R.
+The reason this design is adopted in CHNOSZ is that interactive use of \code{\link{basis}} and \code{\link{species}} appeared to the author, in the early stages of developing the package and of learning \R, to be facilitated by not requiring users to assign the results of these functions to objects.
+Instead, using side effects, the program \dQuote{remembers} the results of these function calls.
+Experience has shown that this design is usable (especially for new users), and is adaptable to many usage scenarios, but the dependence on side effects probably should be eliminated in the future.
 
  The two \emph{major} side effects, that most users will encounter, are the basis and species definitions. These functions and a few other modifications (writing) and accessions (reading) of data objects are listed below. The names of objects in this table refer to the components of the \code{thermo} object; for example, one can type \code{thermo$opt} at the command line to access all of the contents of the \code{opt} component, including those not listed in the table.
 
@@ -32,8 +39,10 @@
     \code{opar} \tab \code{\link{thermo.plot.new}} \tab -- \tab graphical parameters \cr
   }
 
-  Beginning with CHNOSZ version 0.9-9, instances of \code{\link{<<-}} in the code have the effect of \dQuote{superassignment} to an enclosing environment (\samp{CHNOSZ:thermo} on the search path), instead of the \emph{very} highly discouraged assignment to the global environment (user's workspace) used in previous versions of the package. This convention means that if, from the command line, \emph{you} wish to alter something in \code{thermo}, you also should use the \code{\link{<<-}} operator; examples of changing \code{thermo$opt$water} in this manner can found in the help page for \code{\link{water}}.
-  
+Beginning with CHNOSZ version 1.0.0, the \dQuote{superassignment} operator (\code{\link{<<-}}) is no longer used in functions. 
+However, if \emph{you} wish to alter something in \code{thermo} in an interactive session, it is recommended to use the \code{\link{<<-}} operator, instead of \code{\link{<-}}.
+This way, your changes to the \code{thermo} object occur in the \samp{CHNOSZ} environment, which is where the functions in CHNOSZ expect to find it, rather than being saved to the global environment.
+An example of changing \code{thermo$opt$water} in this manner can be found in the help page for \code{\link{water}}.
 }
 
 \seealso{

Modified: pkg/CHNOSZ/man/util.blast.Rd
===================================================================
--- pkg/CHNOSZ/man/util.blast.Rd	2013-03-15 01:23:23 UTC (rev 48)
+++ pkg/CHNOSZ/man/util.blast.Rd	2013-03-24 07:04:53 UTC (rev 49)
@@ -76,7 +76,7 @@
 taxid.names <- read.csv(nfile)
 # the BLAST files
 sites <- c("N","S","R","Q","P")
-bfile <- paste("extdata/bison/bison", sites, "_vs_refseq55.blastp.xz", sep="")
+bfile <- paste("extdata/bison/bison", sites, "_vs_refseq57.blastp.xz", sep="")
 for(i in 1:5) {
   file <- system.file(bfile[i], package="CHNOSZ")
   # read the blast file, with default filtering settings