[CHNOSZ-commits] r57 - in pkg/CHNOSZ: . R inst inst/extdata/refseq man
noreply at r-forge.r-project.org
Sat Oct 5 09:17:54 CEST 2013
Author: jedick
Date: 2013-10-05 09:17:54 +0200 (Sat, 05 Oct 2013)
New Revision: 57
Modified:
pkg/CHNOSZ/DESCRIPTION
pkg/CHNOSZ/R/util.fasta.R
pkg/CHNOSZ/inst/NEWS
pkg/CHNOSZ/inst/extdata/refseq/README.txt
pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R
pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
pkg/CHNOSZ/man/extdata.Rd
pkg/CHNOSZ/man/util.fasta.Rd
pkg/CHNOSZ/man/util.formula.Rd
Log:
update extdata/refseq to RefSeq 61
Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/DESCRIPTION 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,6 +1,6 @@
-Date: 2013-09-22
+Date: 2013-10-05
Package: CHNOSZ
-Version: 1.0.1-1
+Version: 1.0.1-2
Title: Chemical Thermodynamics and Activity Diagrams
Author: Jeffrey M. Dick
Maintainer: Jeffrey Dick <j3ffdick at gmail.com>
Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/R/util.fasta.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -45,16 +45,18 @@
}
read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL,
- start=NULL, stop=NULL, type="protein") {
+ start=NULL, stop=NULL, type="protein", id=NULL) {
# read sequences from a fasta file
# some of the following code was adapted from
# read.fasta in package seqinR
# value of 'i' is what sequences to read
# value of 'ret' determines format of return value:
- # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
+ # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
# or nucleic acid base composition (A-C-G-T)
- # seq: amino acid sequence
- # fas: fasta entry
+ # seq: amino acid sequence
+ # fas: fasta entry
+ # value of 'id' is used for 'protein' in output table,
+ # otherwise ID is parsed from FASTA header (can take a while)
is.nix <- Sys.info()[[1]]=="Linux"
if(is.nix & is.null(lines)) {
msgout("read.fasta: reading ",basename(file),"\n")
@@ -106,7 +108,7 @@
organism <- bnf
# protein/gene name is from header line for entry
# (strip the ">" and go to the first space or underscore)
- id <- as.character(palply(1:length(i), function(j) {
+ if(is.null(id)) id <- as.character(palply(1:length(i), function(j) {
# get the text of the line
f1 <- linefun(i[j],i[j])
# stop if the first character is not ">"
Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/NEWS 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.1-1 (2013-09-22)
+CHANGES IN CHNOSZ 1.0.1-2 (2013-10-05)
--------------------------------------
- Updated extdata/protein/Sce.csv.xz using Saccharomyces Genome Database
@@ -6,6 +6,14 @@
- more.aa() includes SGDID and gene name in the abbrv and ref columns.
+- In extdata/refseq, scripts and data files were updated for NCBI
+ Reference Sequence (RefSeq) release 61 (2013-09-09). Code was adapted
+ to deal with WP multispecies accessions.
+
+- read.fasta() gets new argument 'id'; when supplied, it skips reading
+ the protein names from the FASTA headers.
+
+
CHANGES IN CHNOSZ 1.0.1 (2013-07-04)
------------------------------------
Modified: pkg/CHNOSZ/inst/extdata/refseq/README.txt
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,50 +1,47 @@
# the following data files support calculations using the
-# RefSeq database (release 57, 2013-01-08)
+# RefSeq database (release 61, 2013-09-09)
protein_refseq.csv: overall (average) amino acid composition of all proteins for each
- microbial genome in the RefSeq collection (n=7415)
-taxid_names.csv: taxid, phylum name and species name for 7415 microbial taxa
+ microbial genome in the RefSeq collection (n=6758)
+taxid_names.csv: taxid, phylum name and species name for 6758 microbial taxa
# these functions/scripts have the following purpose (output files listed in parentheses):
-mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq57.faa)
gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt)
protein.refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
taxid.names.R - get taxonomic names for each taxid represented (taxid_names.csv)
+mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq61.faa)
# bash scripts assume a GNU/Linux-like operating system
-# timings were made for processing RefSeq 57 on a recent (2009) intel laptop
+# timings were made for processing RefSeq 61 on a recent (2009) intel laptop
# download stuff
-1. download 'release57.files.installed' and 'RefSeq-release57.catalog.gz' from NCBI
+1. download 'release61.files.installed', 'RefSeq-release61.catalog.gz',
+ and 'release61.multispecies_WP_accession_to_taxname.gz' from NCBI
(ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog)
2. list URLS for the microbial protein sequence files:
- grep microbial.*.protein.faa* release57.files.installed | \
+ grep microbial.*.protein.faa* release61.files.installed | \
sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist
-3. download the files using 'wget -i urllist' [3227 files, 4.6 GB]
+3. download the files using 'wget -i urllist' [232 files, 5.5 GB]
4. move the .gz files to a directory named 'protein'
# protein stuff
-5. gzip -d RefSeq-release57.catalog.gz [3.1 GB]
-6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [7 minutes]
- for RefSeq57, 'cat gi.taxid.txt | wc -l' is 24488527
-7. generate protein_refseq.csv in R: [~19 hours]
+5. gzip -d RefSeq-release61.catalog.gz [3.5 GB]
+6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [3 minutes]
+ for RefSeq61, 'cat gi.taxid.txt | wc -l' is 28548891
+7. generate protein_refseq_complete.csv in R: [~14 hours]
> source("protein.refseq.R")
> protein.refseq()
note that this depends on gi.taxid.txt and the .faa.gz files in the 'protein' directory
-8. trim entries from protein_refseq.csv (smaller size, better for package distribution)
+8. trim entries to produce protein_refseq.csv (smaller size, better for package distribution)
> source("trim_refseq.R")
# taxonomy stuff
9. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files
'names.dmp' and 'nodes.dmp' are present. these files can be downloaded from
- ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-01-15)
-10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~4.5 hours]
+ ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-09-18)
+10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~5.5 hours]
# BLAST stuff (optional)
11. run ls protein/*.gz > filelist
-12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq57.faa' [9.3 GB, 11 minutes]
- for RefSeq57, 'grep "^>" refseq57.faa | wc -l' is 24477649
- the difference from the catalog (step 6 above), is 10878 sequences:
- taxid 1211777 (Rhizobium mesoamericanum STM3625) (6356 sequences)
- taxid 313627 (Bacillus sp. NRRL B-14911) (4522 sequences)
-13. make a BLAST database, e.g. formatdb -t refseq57 -i refseq57.faa -p T
-
+12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq61.faa' [11 GB, 11 minutes]
+ for RefSeq61, 'grep "^>" refseq61.faa | wc -l' is 28548891 (same as catalog)
+13. make a BLAST database, e.g. formatdb -t refseq61 -i refseq61.faa -p T
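A rough R-only sketch of the download step (steps 2-4 above) for systems without wget; it is not part of the commit, and the 'urllist' file and 'protein' directory names simply follow the README:

    # read the URL list produced in step 2 and fetch each file into 'protein/'
    urls <- readLines("urllist")
    dir.create("protein", showWarnings = FALSE)
    for(u in urls) {
      destfile <- file.path("protein", basename(u))
      # skip files that are already present (re-run the loop to resume)
      if(!file.exists(destfile)) download.file(u, destfile, mode = "wb")
    }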
Modified: pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,27 +1,19 @@
#!/bin/sh
# extract microbial, genomic records from the RefSeq catalog
-RELEASE=57
-ORG=microbial
-MOL=protein
-BASENAME=RefSeq-release$RELEASE.catalog
+RELEASE=61
+INFILE=RefSeq-release$RELEASE.catalog
+OUTFILE=RefSeq-release$RELEASE.catalog.microbial.protein
-# extract the microbial records
-grep \|$ORG $BASENAME > $BASENAME.$ORG
+# extract the microbial protein records
+grep "[[:space:]]NP_.*microbial" $INFILE > $OUTFILE # 449849 records
+grep "[[:space:]]YP_.*microbial" $INFILE >> $OUTFILE # 7898382 records
+grep "[[:space:]]WP_.*microbial" $INFILE >> $OUTFILE # 20200660 records
-# extract the protein records
-# alternatively, could use egrep:
-#egrep "[[:space:]]AP_ | [[:space:]]NP_ | [[:space:]]XP_ | \
-# [[:space:]]YP_ | [[:space:]]ZP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL
-grep "[[:space:]]AP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL # 0 records
-grep "[[:space:]]NP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 450218 records
-grep "[[:space:]]XP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 0 records
-grep "[[:space:]]YP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 6786156 records
-grep "[[:space:]]ZP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 17252153 records
-
-# to save only the gi, taxid and sequence length columns, in that order
+# to save only the gi and taxid columns
# the field separator (tab) is defined in command line, not in awk program,
# otherwise the first line gets processed incorrectly
-cat $BASENAME.$ORG.$MOL | awk -F\\t '{print $4,$1,$7}' > gi.taxid.unsrt
+cat $OUTFILE | awk -F\\t '{print $4,$1}' > gi.taxid.unsrt
# sort the file on gi so that it can be used with e.g. the unix 'join' command
+# (note: for join to work, -n for numeric sort is *not* used here)
cat gi.taxid.unsrt | sort > gi.taxid.txt
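The note about sorting matters because the system 'join' command expects both of its inputs in the same (lexicographic) order. A toy R sketch of the point, not taken from the scripts:

    gi <- c("9", "10", "123")
    sort(gi)              # lexicographic: "10" "123" "9"
    sort(as.numeric(gi))  # numeric: 9 10 123
    # gi.taxid.txt is sorted with plain 'sort' (no -n), and protein.refseq.R writes gi.txt
    # from character values with sort(gi), so the two files stay consistent for 'join'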
Modified: pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,6 +1,6 @@
# send the contents of all the .faa.gz files to a single file ("OUTFILE")
-OUTFILE="refseq57.faa"
+OUTFILE="refseq61.faa"
FILELIST="filelist"
# start with an empty file
Modified: pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,129 +1,126 @@
# protein.refseq.R
-# calculate the average amino acid
-# composition of proteins for each taxid
-# 20100704 jmd
+# calculate the overall amino acid composition of proteins for each taxid in RefSeq
+# 20100704 first version
+# 20130922 deal with WP multispecies accessions (RefSeq61)
-# we need CHNOSZ for read.fasta
+# uses system "join" command
+# also, CHNOSZ for read.fasta, def2gi, etc.
require(CHNOSZ)
# where the microbial*.protein.faa.gz files are kept
faadir <- "protein"
-# read gi.taxid if it already doesn't exist
-if(!exists("gi.taxid")) {
- cat("reading gi.taxid.txt ... could take a while!\n")
- gi.taxid <- read.table("gi.taxid.txt")
- # we should make sure that all 3 columns are numeric (not factors)
+# read WP multispecies table if it doesn't already exist
+if(!exists("WP")) {
+ cat("reading WP multispecies table\n")
+ WP <- read.table(dir(pattern="multispecies_WP_accession_to_taxname.txt.gz"), sep="\t")
}
protein.refseq <- function(n=NULL) {
# the list of sequence files (microbial44.protein.fa.gz etc)
files <- dir(faadir)
# loop over each one, getting the contents
- if(is.null(n)) n <- length(files)
+ if(is.null(n)) n <- 1:length(files)
out <- NULL
- for(i in 1:n) {
- cat(paste("(",i,"of",n,") "))
- # check if it's a fasta file
- if(length(grep(".protein.faa.gz$",files[i]))==0) {
- cat(paste("skipping \'",files[i],"\' ...\n"))
+ for(i in n) {
+ cat(paste("(", i, " of ", length(n), ") ", sep=""))
+ # check if the name looks like a fasta file
+ if(length(grep(".protein.faa.gz$", files[i]))==0) {
+ cat(paste("skipping", files[i], "...\n"))
next
} else {
- cat(paste("reading \'",files[i],"\' ... "))
+ cat(paste("reading", files[i], "... "))
}
- # read the fasta file
- fa <- readLines(paste(faadir,files[i],sep="/"))
- # get the gi's for this file
- ihead.orig <- ihead <- grep("^>",fa)
- cat(paste("file has",length(ihead),"sequences\n"))
+ # read the file
+ fa <- readLines(paste(faadir, files[i], sep="/"))
+ # get the gi's from this file
+ ihead <- grep("^>", fa)
+ cat(paste("file has", length(ihead), "sequences\n"))
+ # leave the gi's as character to sort lexicographically (for use with system join command)
gi <- def2gi(fa[ihead])
# now find the taxids that belong to these gi's
cat("finding matching taxids in catalog ...")
- gi.order <- order(as.character(gi))
- gi <- gi[gi.order]
- ihead <- ihead[gi.order]
- write.table(gi,"gi.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
- system("join -1 1 -2 1 gi.txt gi.taxid.txt > gi.taxid.match")
- gi.taxid.match <- read.table("gi.taxid.match",header=FALSE)
- tax <- gi.taxid.match$V2
- utax <- unique(tax)
- cat(paste(" found",length(utax),"unique taxa\n"))
+ write.table(sort(gi), "gi.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)
+ system("join -j 1 gi.txt gi.taxid.txt > gi.taxid.match")
+ gi.taxid <- read.table("gi.taxid.match", header=FALSE)
+ # produce an expanded table, replacing multispecies WP_ records with all corresponding taxids
+ is.multi <- gi.taxid$V1 %in% WP$V2
+ is.in.gi <- WP$V2 %in% gi.taxid$V1
+ # take out generic records
+ gi.taxid <- gi.taxid[!is.multi, ]
+ # add specific records
+ gi.taxid.WP <- WP[is.in.gi, ]
+ colnames(gi.taxid.WP) <- c("V0", "V1", "V2", "V3")
+ gi.taxid <- rbind(gi.taxid, gi.taxid.WP[, c(2, 3)])
+ # identify unique taxids
+ utax <- unique(gi.taxid$V2)
+ cat(paste(" found", length(utax), "unique taxa\n"))
# now loop over each taxon
for(j in 1:length(utax)) {
- jtax <- which(tax==utax[j])
- nseq.read <- length(jtax)
- cat(paste("taxon",utax[j],"has",nseq.read,"sequences"))
- # count the number of sequences in the catalog
- jjtax <- which(gi.taxid$V2==utax[j])
- nseq.cat <- length(jjtax)
- if(nseq.cat==nseq.read) cat(" (same as catalog)")
- else cat(paste(" WARNING: catlog has",nseq.cat))
+ # report on the taxon name and number of sequences
+ jtax <- which(gi.taxid$V2==utax[j])
+ nseq <- length(jtax)
+ cat(paste("taxon", utax[j], "has", nseq, "sequences "))
+ # match the gi numbers to their position in the fasta file
+ igi <- match(gi.taxid$V1[jtax], as.numeric(gi))
# get the amino acid compositions of the sequences
- aa <- read.fasta(file="",i=ihead[jtax],lines=fa,ihead=ihead.orig)
- naa.read <- sum(aa[,6:25])
- # count the number of amino acids in the catalog
- naa.cat <- sum(gi.taxid$V3[jjtax])
- if(naa.cat==naa.read) cat(paste("... all of",naa.cat,"amino acids "))
- else cat(paste("...",naa.read,"of",naa.cat,"amino acids "))
+ # use suppressMessages to hide messages about unrecognized amino acid codes
+ # set id="refseq" to avoid parsing IDs from fasta header lines
+ aa <- suppressMessages(read.fasta(file="", i=ihead[igi], lines=fa, ihead=ihead, id="refseq"))
+ # the number of amino acids read from file
+ naa <- sum(aa[, 6:25])
# prepare the dataframe
# average amino acid composition
- aa[1,6:25] <- colSums(aa[,6:25])/nrow(aa)
- aa <- aa[1,]
- aa$protein <- "refseq"
+ aa[1, 6:25] <- colSums(aa[, 6:25])/nrow(aa)
+ aa <- aa[1, ]
aa$organism <- utax[j]
aa$chains <- 1
- # the number of amino acids read from files
- aa$abbrv <- naa.read
- # what is the name of this organism
- itax1 <- match(utax[j],gi.taxid.match$V2)
- gi1 <- gi.taxid.match$V1[itax1]
- igi1 <- match(gi1,gi)
- iihead <- ihead[igi1]
- orgname <- s2c(fa[iihead],sep=" [",keep.sep=TRUE)[2]
- orgname <- substr(orgname,2,nchar(orgname))
- cat(paste(orgname,"\n"))
- # numbers of sequences, amino acids, organism name
- aa <- cbind(aa,data.frame(nseq=nseq.read,naa=naa.read,nseq.cat=nseq.cat,naa.cat=naa.cat,orgname=orgname))
- # keep track of source file name
- # (append number of sequences according to the catalog)
- aa$ref <- paste(sub(".protein.faa.gz","",files[i]),"(",nseq.read,",",naa.read,")",sep="")
- if(j==1) aa.out <- aa else aa.out <- rbind(aa.out,aa)
+ aa$abbrv <- naa
+ # what is the name of this organism: look first at multispecies WP, then in gi header
+ jtax.WP <- match(utax[j], gi.taxid.WP$V2)
+ if(!is.na(jtax.WP)) orgname <- paste("[", gi.taxid.WP$V3[jtax.WP], "]", sep="")
+ else {
+ orgname <- s2c(fa[ihead[igi[1]]], sep=" [", keep.sep=TRUE)[2]
+ orgname <- substr(orgname, 2, nchar(orgname))
+ }
+ cat(paste(orgname, "\n"))
+ # append columns for numbers of sequences, organism name
+ aa <- cbind(aa, data.frame(nseq=nseq, orgname=orgname))
+ # keep track of source file name, and number of sequences and amino acids
+ fname <- sub("microbial.nonredundant_protein", "nonredundant", sub(".protein.faa.gz", "", files[i]))
+ aa$ref <- paste(fname, "(", nseq, ",", naa, ")", sep="")
+ if(j==1) aa.out <- aa else aa.out <- rbind(aa.out, aa)
}
- if(is.null(out)) out <- aa.out else out <- rbind(out,aa.out)
+ if(is.null(out)) out <- aa.out else out <- rbind(out, aa.out)
}
- # now we have to combine taxids that showed up more than once
- # e.g. microbial10;microbial9(2231743)
+
+ # now we have to combine taxids that showed up more than once (in different files)
duptax <- unique(out$organism[duplicated(out$organism)])
if(length(duptax) > 0) {
for(i in 1:length(duptax)) {
id <- which(out$organism==duptax[i])
# add them together, weighted by numbers of sequences
- aa <- colSums(out$nseq[id]*out[id,6:25])/sum(out$nseq[id])
+ aa <- colSums(out$nseq[id]*out[id, 6:25]) / sum(out$nseq[id])
# keep the result in the first row
- out[id[1],6:25] <- aa
+ out[id[1], 6:25] <- aa
# total number of sequences found in the files
out$nseq[id[1]] <- sum(out$nseq[id])
# total number of amino acids found in the sequence files
- out$naa[id[1]] <- sum(out$naa[id])
- out$abbrv[id[1]] <- out$naa[id[1]]
+ out$abbrv[id[1]] <- sum(out$abbrv[id])
# write down the file names
- out$ref[id[1]] <- c2s(out$ref[id],sep=";")
- # drop the duplicated ones
- out <- out[-id[2:length(id)],]
+ out$ref[id[1]] <- c2s(out$ref[id], sep=";")
+ # drop the rows after the first
+ out <- out[-id[2:length(id)], ]
}
}
- # summarize the organism name and numbers of
- # sequences/amino acids read vs. the catalog
+ # append the organism name to the numbers of sequences and amino acids read
for(i in 1:nrow(out)) {
- dnseq <- out$nseq[i] - out$nseq.cat[i]
- dnaa <- out$naa[i] - out$naa.cat[i]
- if(dnseq != 0 | dnaa != 0) out$ref[i] <- paste("[",dnseq,",",dnaa,"]",out$ref[i],sep="")
- out$ref[i] <- paste(out$ref[i],out$orgname[i],sep="")
+ out$ref[i] <- paste(out$ref[i], out$orgname[i], sep="")
}
# we can drop the extra columns now
- out <- out[,1:25]
+ out <- out[, 1:25]
# and do a little rounding
- out[,6:25] <- round(out[,6:25],3)
- write.csv(out,"protein_refseq.csv",row.names=FALSE)
+ out[, 6:25] <- round(out[, 6:25], 3)
+ write.csv(out, "protein_refseq_complete.csv", row.names=FALSE)
return(out)
}
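The merging of duplicated taxids above is a weighted average over source files, with weights given by the number of sequences. A toy sketch with made-up numbers (not from the RefSeq data):

    nseq <- c(100, 300)          # sequences contributed by each file
    ala  <- c(8.2, 7.4)          # average alanine count per protein in each file
    sum(nseq * ala) / sum(nseq)  # 7.6, weighted toward the larger file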
Modified: pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
===================================================================
(Binary files differ)
Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -2,11 +2,14 @@
# of species,genus,family,order,class,phylum,superkingdom
# for each of the microbial taxa in RefSeq database
+# uses functions in CHNOSZ to process taxonomy files
+require(CHNOSZ)
+
# change this to the location where names.dmp and nodes.dmp are located
taxdir <- "./taxdump"
# get the taxids from protein_refseq.csv
-pr <- read.csv("protein_refseq.csv.xz")
+pr <- read.csv("protein_refseq.csv")
taxid <- pr$organism
# read in the names and nodes
@@ -20,7 +23,7 @@
}
# what ranks we want to get
-ranks <- c("species","genus","family","order","class","phylum","superkingdom")
+ranks <- c("species", "genus", "family", "order", "class", "phylum", "superkingdom")
# start with an empty list
out <- rep(list(character()), length(ranks))
@@ -43,14 +46,14 @@
# get names of these parents
pnames <- sciname(pids[ip], taxdir, names=taxnames)
# add results to output list
- for(j in 1:length(ranks)) out[[j]] <- c(out[[j]],pnames[j])
+ for(j in 1:length(ranks)) out[[j]] <- c(out[[j]], pnames[j])
# report progress
- if(i %% 50 == 0) cat(paste(i,".. "))
+ if(i %% 50 == 0) cat(paste(i, ".. "))
}
# finish progress report
cat("done!\n")
# write results to a file
out <- as.data.frame(out)
-out <- cbind(data.frame(taxid=taxid[ii],out))
-write.csv(out,"taxid_names.csv",row.names=FALSE,quote=FALSE)
+out <- cbind(data.frame(taxid=taxid[ii], out))
+write.csv(out, "taxid_names.csv", row.names=FALSE, quote=FALSE)
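A minimal sketch of the CHNOSZ taxonomy helpers that taxid.names.R relies on, run in isolation; it assumes names.dmp from the NCBI taxdump sits in './taxdump' as in the script, and that getnames() is the CHNOSZ reader for that file:

    require(CHNOSZ)
    taxdir <- "./taxdump"                      # location of names.dmp (and nodes.dmp)
    taxnames <- getnames(taxdir)               # read names.dmp; slow for the full taxdump
    sciname(83333, taxdir, names = taxnames)   # scientific name of taxid 83333 (Escherichia coli K-12)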
Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
===================================================================
(Binary files differ)
Modified: pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,66 +1,15 @@
-# trim the protein_refseq.csv, removing some entries with highly-represented names
+# trim the protein_refseq.csv,
+# keeping only organisms used in an example in ?protein.info
# (to keep the file size down for CHNOSZ package)
-# 20130320 jmd
-# the original file (7415 rows)
-pr <- read.csv("protein_refseq.csv")
-# the common names (comments show number in RefSeq 57
-names <- c(
- "Escherichia coli", # 662
- "Streptococcus", # 432
- "Salmonella", # 299
- "Staphylococcus", # 290
- "Enterococcus", # 218
- "Vibrio", # 190
- "Lactobacillus", # 179
- "Helicobacter", # 164
- "Pseudomonas", # 155
- "Mycobacterium", # 138
- "Campylobacter", # 131
- "Neisseria", # 121
- "Clostridium", # 118
- "Yersinia", # 111
- "Bacillus cereus", # 105
- "Acinetobacter", # 103
- "Propionibacterium", # 93
- "Burkholderia", # 91
- "Candidatus", # 83
- "Bacteroides", # 80
- "Mycoplasma", # 73
- "Streptomyces", # 72
- "Corynebacterium", # 70
- "Listeria", # 60
- "Leptospira", # 57
- "Klebsiella", # 57
- "Bifidobacterium", # 55
- "Brucella", # 53
- "Shigella", # 50
- "Haemophilus", # 47
- "Rickettsia", # 46
- "Prevotella", # 44
- "Chlamydia", # 42
- "Francisella", # 37
- "Bacillus thuringiensis", # 36
- "Borrelia", # 34
- "Fusobacterium", # 33
- "Xanthomonas", # 31
- "Rhizobium", # 27
- "Bartonella", # 26
- "Pseudoalteromonas", # 24
- "Bacillus anthracis", # 23
- "Actinomyces", # 23
- "Treponema", # 21
- "Actinobacillus", # 20
- "Aggregatibacter", # 20
- "Gardnerella" # 19
-)
-
-# loop over the names, identify rows, leave the first row
-for(name in names) {
- iname <- grep(name, pr$ref)
- iname <- tail(iname, -1)
- pr <- pr[-iname, ]
-}
-
-# we're left with 2600 rows
+# the original file
+pr <- read.csv("protein_refseq_complete.csv")
+# the terms we'll keep
+terms <- c("Natr", "Halo", "Rhodo", "Acido", "Methylo",
+ "Nitro", "Desulfo", "Chloro", "Geo", "Methano",
+ "Thermo", "Pyro", "Sulfo", "Buchner")
+# identify rows to keep
+iterm <- unique(unlist(sapply(terms, grep, pr$ref)))
+# extract data frame, write new file
+pr <- pr[iterm, ]
write.csv(pr, "protein_refseq.csv", row.names=FALSE)
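The term matching in trim_refseq.R keeps any row whose 'ref' field matches at least one term. A toy sketch with invented 'ref' strings:

    ref <- c("nonredundant1(10,5000)Halobacterium sp.",
             "nonredundant2(8,4000)Escherichia coli")
    terms <- c("Halo", "Methano")
    unique(unlist(sapply(terms, grep, ref)))   # 1: only the Halobacterium row is kept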
Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/extdata.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -57,17 +57,20 @@
}
- Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 57 of 2013-01-08:
+ Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 61 of 2013-09-09:
\itemize{
\item \code{README.txt} Instructions for producing the data files.
\item \code{gencat.sh} Bash script to extract microbial protein records from the RefSeq catalog.
- \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome.
+ \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome (based on RefSeq 57, 2013-01-08).
\item \code{mkfaa.sh} Combine the contents of .faa.gz files into a single FASTA file (to use e.g. for making a BLAST database).
\item \code{protein.refseq.R} Calculate average amino acid composition of all proteins for each organism identified by a taxonomic ID.
- \item \code{trim_refseq.R} Remove some entries with commonly occurring names (e.g. many different strains of Escherichia coli) to reduce size of \code{protein_refseq.csv} (to keep package size down).
- \item \code{protein_refseq.csv.xz} Output from above. See example for \code{\link{ZC}}.
+ \item \code{trim_refseq.R} Keep only selected organism names (reduces number of taxa from 6758 to 779, helps to control package size).
+ \item \code{protein_refseq.csv.xz} Output from above. See example in \code{\link{protein.info}}.
\item \code{taxid.names.R} Generate a table of scientific names for the provided taxids. Requires the complete \code{names.dmp} and \code{nodes.dmp} from NCBI taxonomy files.
- \item \code{taxid_names.csv.xz} Output from above. See example for \code{\link{id.blast}}.
+ \item \code{taxid_names.csv.xz} Output from above.
+ NOTE: For backward compatibility with the example BLAST files for the Bison Pool metagenome, the packaged file merges records for taxids found in either RefSeq 57 or 61.
+ Certain taxids in release 57 were not located in the current RefSeq catalog, probably related to the transition to the \dQuote{WP} multispecies accessions (\url{ftp://ftp.ncbi.nlm.nih.gov/refseq/release/announcements/WP-proteins-06.10.2013.pdf}).
+ See example for \code{\link{id.blast}}.
}
Files in \code{taxonomy} contain example taxonomic data files:
Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/util.fasta.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -14,7 +14,7 @@
grep.file(file, pattern = "", y = NULL, ignore.case = TRUE,
startswith = ">", lines = NULL, grep = "grep")
read.fasta(file, i = NULL, ret = "count", lines = NULL,
- ihead = NULL, start=NULL, stop=NULL, type="protein")
+ ihead = NULL, start=NULL, stop=NULL, type="protein", id = NULL)
count.aa(seq, start=NULL, stop=NULL, type="protein")
uniprot.aa(protein, start=NULL, stop=NULL)
}
@@ -26,13 +26,14 @@
\item{ignore.case}{logical, ignore differences between upper- and lower-case?}
\item{startswith}{character, only lines starting with this expression are matched}
\item{lines}{list of character, supply the lines here instead of reading them from file}
- \item{grep}{character, name of system \code{grep} command}
+ \item{grep}{character, name of system \samp{grep} command}
\item{i}{numeric, line numbers of sequence headers to read}
\item{ret}{character, specification for type of return (count, sequence, or FASTA format)}
\item{ihead}{numeric, which lines are headers}
\item{start}{numeric, position in sequence to start counting}
\item{stop}{numeric, position in sequence to stop counting}
\item{type}{character, sequence type (protein or DNA)}
+ \item{id}{character, value to be used for \code{protein} in output table}
\item{seq}{character, amino acid sequence of a protein}
\item{protein}{character, entry name for protein in UniProt}
}
@@ -48,10 +49,12 @@
\code{read.fasta} is used to retrieve entries from a FASTA file.
To read only selected sequences pass the line numbers of the header lines to the function in \code{i} (they can be identified using e.g. \code{grep.file}).
-The function returns various formats depending on the value of \code{ret}; the default \samp{count} returns a dataframe of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
+The function returns various formats depending on the value of \code{ret}.
+The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
Similarly to \code{grep.file}, this function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines.
If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}.
Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
+When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
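A minimal usage sketch of the new 'id' argument described above; "some.faa" is a placeholder file name:

    require(CHNOSZ)
    aa1 <- read.fasta("some.faa")                 # 'protein' column parsed from the ">" headers
    aa2 <- read.fasta("some.faa", id = "myseqs")  # 'protein' column set to "myseqs"; skips header parsing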
Modified: pkg/CHNOSZ/man/util.formula.Rd
===================================================================
--- pkg/CHNOSZ/man/util.formula.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/util.formula.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -60,7 +60,9 @@
}
\seealso{
- \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument). \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}}
+ \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument).
+ \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}}.
+ \code{\link{protein.formula}} has an example of computing ZC for proteins compiled from the RefSeq database.
}
\examples{\dontshow{data(thermo)}
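The cross-reference added to util.formula.Rd points at a ZC calculation for the RefSeq average proteins. A sketch of what such a calculation can look like, assuming protein.formula() accepts the amino acid composition data frame read from the packaged file:

    require(CHNOSZ)
    file <- system.file("extdata/refseq/protein_refseq.csv.xz", package = "CHNOSZ")
    aa <- read.csv(file)            # read.csv decompresses the .xz file transparently
    zc <- ZC(protein.formula(aa))   # average oxidation state of carbon, one value per taxon
    summary(zc)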