[CHNOSZ-commits] r57 - in pkg/CHNOSZ: . R inst inst/extdata/refseq man
noreply at r-forge.r-project.org
Sat Oct 5 09:17:54 CEST 2013
Author: jedick
Date: 2013-10-05 09:17:54 +0200 (Sat, 05 Oct 2013)
New Revision: 57
Modified:
pkg/CHNOSZ/DESCRIPTION
pkg/CHNOSZ/R/util.fasta.R
pkg/CHNOSZ/inst/NEWS
pkg/CHNOSZ/inst/extdata/refseq/README.txt
pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R
pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
pkg/CHNOSZ/man/extdata.Rd
pkg/CHNOSZ/man/util.fasta.Rd
pkg/CHNOSZ/man/util.formula.Rd
Log:
update extdata/refseq to RefSeq 61
Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/DESCRIPTION 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,6 +1,6 @@
-Date: 2013-09-22
+Date: 2013-10-05
Package: CHNOSZ
-Version: 1.0.1-1
+Version: 1.0.1-2
Title: Chemical Thermodynamics and Activity Diagrams
Author: Jeffrey M. Dick
Maintainer: Jeffrey Dick <j3ffdick at gmail.com>
Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/R/util.fasta.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -45,16 +45,18 @@
}
read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL,
- start=NULL, stop=NULL, type="protein") {
+ start=NULL, stop=NULL, type="protein", id=NULL) {
# read sequences from a fasta file
# some of the following code was adapted from
# read.fasta in package seqinR
# value of 'i' is what sequences to read
# value of 'ret' determines format of return value:
- # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
+ # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
# or nucleic acid base composition (A-C-G-T)
- # seq: amino acid sequence
- # fas: fasta entry
+ # seq: amino acid sequence
+ # fas: fasta entry
+ # value of 'id' is used for 'protein' in output table,
+ # otherwise ID is parsed from FASTA header (can take a while)
is.nix <- Sys.info()[[1]]=="Linux"
if(is.nix & is.null(lines)) {
msgout("read.fasta: reading ",basename(file),"\n")
@@ -106,7 +108,7 @@
organism <- bnf
# protein/gene name is from header line for entry
# (strip the ">" and go to the first space or underscore)
- id <- as.character(palply(1:length(i), function(j) {
+ if(is.null(id)) id <- as.character(palply(1:length(i), function(j) {
# get the text of the line
f1 <- linefun(i[j],i[j])
# stop if the first character is not ">"
Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/NEWS 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.1-1 (2013-09-22)
+CHANGES IN CHNOSZ 1.0.1-2 (2013-10-05)
--------------------------------------
- Updated extdata/protein/Sce.csv.xz using Saccharomyces Genome Database
@@ -6,6 +6,14 @@
- more.aa() includes SGDID and gene name in the abbrv and ref columns.
+- In extdata/refseq, scripts and data files were updated for NCBI
+ Reference Sequence (RefSeq) release 61 (2013-09-09). Code was adapted
+ to deal with WP multispecies accessions.
+
+- read.fasta() gets new argument 'id'; when supplied, it skips reading
+ the protein names from the FASTA headers.
+
+
CHANGES IN CHNOSZ 1.0.1 (2013-07-04)
------------------------------------
Modified: pkg/CHNOSZ/inst/extdata/refseq/README.txt
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,50 +1,47 @@
# the following data files support calculations using the
-# RefSeq database (release 57, 2013-01-08)
+# RefSeq database (release 61, 2013-09-09)
protein_refseq.csv: overall (average) amino acid composition of all proteins for each
- microbial genome in the RefSeq collection (n=7415)
-taxid_names.csv: taxid, phylum name and species name for 7415 microbial taxa
+ microbial genome in the RefSeq collection (n=6758)
+taxid_names.csv: taxid, phylum name and species name for 6758 microbial taxa
# these functions/scripts have the following purpose (output files listed in parentheses):
-mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq57.faa)
gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt)
protein.refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
taxid.names.R - get taxonomic names for each taxid represented (taxid_names.csv)
+mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq61.faa)
# bash scripts assume a GNU/Linux-like operating system
-# timings were made for processing RefSeq 57 on a recent (2009) intel laptop
+# timings were made for processing RefSeq 61 on a recent (2009) intel laptop
# download stuff
-1. download 'release57.files.installed' and 'RefSeq-release57.catalog.gz' from NCBI
+1. download 'release61.files.installed', 'RefSeq-release61.catalog.gz',
+ and 'release61.multispecies_WP_accession_to_taxname.gz' from NCBI
(ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog)
2. list URLS for the microbial protein sequence files:
- grep microbial.*.protein.faa* release57.files.installed | \
+ grep microbial.*.protein.faa* release61.files.installed | \
sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist
-3. download the files using 'wget -i urllist' [3227 files, 4.6 GB]
+3. download the files using 'wget -i urllist' [232 files, 5.5 GB]
4. move the .gz files to a directory named 'protein'
# protein stuff
-5. gzip -d RefSeq-release57.catalog.gz [3.1 GB]
-6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [7 minutes]
- for RefSeq57, 'cat gi.taxid.txt | wc -l' is 24488527
-7. generate protein_refseq.csv in R: [~19 hours]
+5. gzip -d RefSeq-release61.catalog.gz [3.5 GB]
+6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [3 minutes]
+ for RefSeq61, 'cat gi.taxid.txt | wc -l' is 28548891
+7. generate protein_refseq_complete.csv in R: [~14 hours]
> source("protein.refseq.R")
> protein.refseq()
note that this depends on gi.taxid.txt and the .faa.gz files in the 'protein' directory
-8. trim entries from protein_refseq.csv (smaller size, better for package distribution)
+8. trim entries to produce protein_refseq.csv (smaller size, better for package distribution)
> source("trim_refseq.R")
# taxonomy stuff
9. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files
'names.dmp' and 'nodes.dmp' are present. these files can be downloaded from
- ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-01-15)
-10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~4.5 hours]
+ ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-09-18)
+10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~5.5 hours]
# BLAST stuff (optional)
11. run ls protein/*.gz > filelist
-12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq57.faa' [9.3 GB, 11 minutes]
- for RefSeq57, 'grep "^>" refseq57.faa | wc -l' is 24477649
- the difference from the catalog (step 6 above), is 10878 sequences:
- taxid 1211777 (Rhizobium mesoamericanum STM3625) (6356 sequences)
- taxid 313627 (Bacillus sp. NRRL B-14911) (4522 sequences)
-13. make a BLAST database, e.g. formatdb -t refseq57 -i refseq57.faa -p T
-
+12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq61.faa' [11 GB, 11 minutes]
+ for RefSeq61, 'grep "^>" refseq61.faa | wc -l' is 28548891 (same as catalog)
+13. make a BLAST database, e.g. formatdb -t refseq61 -i refseq61.faa -p T
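A rough R-only sketch of the download step (steps 2-4 above) for systems without wget; it is not part of the commit, and the 'urllist' file and 'protein' directory names simply follow the README:

    # read the URL list produced in step 2 and fetch each file into 'protein/'
    urls <- readLines("urllist")
    dir.create("protein", showWarnings = FALSE)
    for(u in urls) {
      destfile <- file.path("protein", basename(u))
      # skip files that are already present (re-run the loop to resume)
      if(!file.exists(destfile)) download.file(u, destfile, mode = "wb")
    }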
Modified: pkg/CHNOSZ/inst/extdata/refseq/gencat.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,27 +1,19 @@
#!/bin/sh
# extract microbial, genomic records from the RefSeq catalog
-RELEASE=57
-ORG=microbial
-MOL=protein
-BASENAME=RefSeq-release$RELEASE.catalog
+RELEASE=61
+INFILE=RefSeq-release$RELEASE.catalog
+OUTFILE=RefSeq-release$RELEASE.catalog.microbial.protein
-# extract the microbial records
-grep \|$ORG $BASENAME > $BASENAME.$ORG
+# extract the microbial protein records
+grep "[[:space:]]NP_.*microbial" $INFILE > $OUTFILE # 449849 records
+grep "[[:space:]]YP_.*microbial" $INFILE >> $OUTFILE # 7898382 records
+grep "[[:space:]]WP_.*microbial" $INFILE >> $OUTFILE # 20200660 records
-# extract the protein records
-# alternatively, could use egrep:
-#egrep "[[:space:]]AP_ | [[:space:]]NP_ | [[:space:]]XP_ | \
-# [[:space:]]YP_ | [[:space:]]ZP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL
-grep "[[:space:]]AP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL # 0 records
-grep "[[:space:]]NP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 450218 records
-grep "[[:space:]]XP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 0 records
-grep "[[:space:]]YP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 6786156 records
-grep "[[:space:]]ZP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 17252153 records
-
-# to save only the gi, taxid and sequence length columns, in that order
+# to save only the gi and taxid columns
# the field separator (tab) is defined in command line, not in awk program,
# otherwise the first line gets processed incorrectly
-cat $BASENAME.$ORG.$MOL | awk -F\\t '{print $4,$1,$7}' > gi.taxid.unsrt
+cat $OUTFILE | awk -F\\t '{print $4,$1}' > gi.taxid.unsrt
# sort the file on gi so that it can be used with e.g. the unix 'join' command
+# (note: for join to work, -n for numeric sort is *not* used here)
cat gi.taxid.unsrt | sort > gi.taxid.txt
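The note about sorting matters because the system 'join' command expects both of its inputs in the same (lexicographic) order. A toy R sketch of the point, not taken from the scripts:

    gi <- c("9", "10", "123")
    sort(gi)              # lexicographic: "10" "123" "9"
    sort(as.numeric(gi))  # numeric: 9 10 123
    # gi.taxid.txt is sorted with plain 'sort' (no -n), and protein.refseq.R writes gi.txt
    # from character values with sort(gi), so the two files stay consistent for 'join'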
Modified: pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,6 +1,6 @@
# send the contents of all the .faa.gz files to a single file ("OUTFILE")
-OUTFILE="refseq57.faa"
+OUTFILE="refseq61.faa"
FILELIST="filelist"
# start with an empty file
Modified: pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,129 +1,126 @@
# protein.refseq.R
-# calculate the average amino acid
-# composition of proteins for each taxid
-# 20100704 jmd
+# calculate the overall amino acid composition of proteins for each taxid in RefSeq
+# 20100704 first version
+# 20130922 deal with WP multispecies accessions (RefSeq61)
-# we need CHNOSZ for read.fasta
+# uses system "join" command
+# also, CHNOSZ for read.fasta, def2gi, etc.
require(CHNOSZ)
# where the microbial*.protein.faa.gz files are kept
faadir <- "protein"
-# read gi.taxid if it already doesn't exist
-if(!exists("gi.taxid")) {
- cat("reading gi.taxid.txt ... could take a while!\n")
- gi.taxid <- read.table("gi.taxid.txt")
- # we should make sure that all 3 columns are numeric (not factors)
+# read WP multispecies table if it doesn't already exist
+if(!exists("WP")) {
+ cat("reading WP multispecies table\n")
+ WP <- read.table(dir(pattern="multispecies_WP_accession_to_taxname.txt.gz"), sep="\t")
}
protein.refseq <- function(n=NULL) {
# the list of sequence files (microbial44.protein.fa.gz etc)
files <- dir(faadir)
# loop over each one, getting the contents
- if(is.null(n)) n <- length(files)
+ if(is.null(n)) n <- 1:length(files)
out <- NULL
- for(i in 1:n) {
- cat(paste("(",i,"of",n,") "))
- # check if it's a fasta file
- if(length(grep(".protein.faa.gz$",files[i]))==0) {
- cat(paste("skipping \'",files[i],"\' ...\n"))
+ for(i in n) {
+ cat(paste("(", i, " of ", length(n), ") ", sep=""))
+ # check if the name looks like a fasta file
+ if(length(grep(".protein.faa.gz$", files[i]))==0) {
+ cat(paste("skipping", files[i], "...\n"))
next
} else {
- cat(paste("reading \'",files[i],"\' ... "))
+ cat(paste("reading", files[i], "... "))
}
- # read the fasta file
- fa <- readLines(paste(faadir,files[i],sep="/"))
- # get the gi's for this file
- ihead.orig <- ihead <- grep("^>",fa)
- cat(paste("file has",length(ihead),"sequences\n"))
+ # read the file
+ fa <- readLines(paste(faadir, files[i], sep="/"))
+ # get the gi's from this file
+ ihead <- grep("^>", fa)
+ cat(paste("file has", length(ihead), "sequences\n"))
+ # leave the gi's as character to sort lexicographically (for use with system join command)
gi <- def2gi(fa[ihead])
# now find the taxids that belong to these gi's
cat("finding matching taxids in catalog ...")
- gi.order <- order(as.character(gi))
- gi <- gi[gi.order]
- ihead <- ihead[gi.order]
- write.table(gi,"gi.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
- system("join -1 1 -2 1 gi.txt gi.taxid.txt > gi.taxid.match")
- gi.taxid.match <- read.table("gi.taxid.match",header=FALSE)
- tax <- gi.taxid.match$V2
- utax <- unique(tax)
- cat(paste(" found",length(utax),"unique taxa\n"))
+ write.table(sort(gi), "gi.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)
+ system("join -j 1 gi.txt gi.taxid.txt > gi.taxid.match")
+ gi.taxid <- read.table("gi.taxid.match", header=FALSE)
+ # produce an expanded table, replacing multispecies WP_ records with all corresponding taxids
+ is.multi <- gi.taxid$V1 %in% WP$V2
+ is.in.gi <- WP$V2 %in% gi.taxid$V1
+ # take out generic records
+ gi.taxid <- gi.taxid[!is.multi, ]
+ # add specific records
+ gi.taxid.WP <- WP[is.in.gi, ]
+ colnames(gi.taxid.WP) <- c("V0", "V1", "V2", "V3")
+ gi.taxid <- rbind(gi.taxid, gi.taxid.WP[, c(2, 3)])
+ # identify unique taxids
+ utax <- unique(gi.taxid$V2)
+ cat(paste(" found", length(utax), "unique taxa\n"))
# now loop over each taxon
for(j in 1:length(utax)) {
- jtax <- which(tax==utax[j])
- nseq.read <- length(jtax)
- cat(paste("taxon",utax[j],"has",nseq.read,"sequences"))
- # count the number of sequences in the catalog
- jjtax <- which(gi.taxid$V2==utax[j])
- nseq.cat <- length(jjtax)
- if(nseq.cat==nseq.read) cat(" (same as catalog)")
- else cat(paste(" WARNING: catlog has",nseq.cat))
+ # report on the taxon name and number of sequences
+ jtax <- which(gi.taxid$V2==utax[j])
+ nseq <- length(jtax)
+ cat(paste("taxon", utax[j], "has", nseq, "sequences "))
+ # match the gi numbers to their position in the fasta file
+ igi <- match(gi.taxid$V1[jtax], as.numeric(gi))
# get the amino acid compositions of the sequences
- aa <- read.fasta(file="",i=ihead[jtax],lines=fa,ihead=ihead.orig)
- naa.read <- sum(aa[,6:25])
- # count the number of amino acids in the catalog
- naa.cat <- sum(gi.taxid$V3[jjtax])
- if(naa.cat==naa.read) cat(paste("... all of",naa.cat,"amino acids "))
- else cat(paste("...",naa.read,"of",naa.cat,"amino acids "))
+ # use suppressMessages to hide messages about unrecognized amino acid codes
+ # set id="refseq" to avoid parsing IDs from fasta header lines
+ aa <- suppressMessages(read.fasta(file="", i=ihead[igi], lines=fa, ihead=ihead, id="refseq"))
+ # the number of amino acids read from file
+ naa <- sum(aa[, 6:25])
# prepare the dataframe
# average amino acid composition
- aa[1,6:25] <- colSums(aa[,6:25])/nrow(aa)
- aa <- aa[1,]
- aa$protein <- "refseq"
+ aa[1, 6:25] <- colSums(aa[, 6:25])/nrow(aa)
+ aa <- aa[1, ]
aa$organism <- utax[j]
aa$chains <- 1
- # the number of amino acids read from files
- aa$abbrv <- naa.read
- # what is the name of this organism
- itax1 <- match(utax[j],gi.taxid.match$V2)
- gi1 <- gi.taxid.match$V1[itax1]
- igi1 <- match(gi1,gi)
- iihead <- ihead[igi1]
- orgname <- s2c(fa[iihead],sep=" [",keep.sep=TRUE)[2]
- orgname <- substr(orgname,2,nchar(orgname))
- cat(paste(orgname,"\n"))
- # numbers of sequences, amino acids, organism name
- aa <- cbind(aa,data.frame(nseq=nseq.read,naa=naa.read,nseq.cat=nseq.cat,naa.cat=naa.cat,orgname=orgname))
- # keep track of source file name
- # (append number of sequences according to the catalog)
- aa$ref <- paste(sub(".protein.faa.gz","",files[i]),"(",nseq.read,",",naa.read,")",sep="")
- if(j==1) aa.out <- aa else aa.out <- rbind(aa.out,aa)
+ aa$abbrv <- naa
+ # what is the name of this organism: look first at multispecies WP, then in gi header
+ jtax.WP <- match(utax[j], gi.taxid.WP$V2)
+ if(!is.na(jtax.WP)) orgname <- paste("[", gi.taxid.WP$V3[jtax.WP], "]", sep="")
+ else {
+ orgname <- s2c(fa[ihead[igi[1]]], sep=" [", keep.sep=TRUE)[2]
+ orgname <- substr(orgname, 2, nchar(orgname))
+ }
+ cat(paste(orgname, "\n"))
+ # append columns for numbers of sequences, organism name
+ aa <- cbind(aa, data.frame(nseq=nseq, orgname=orgname))
+ # keep track of source file name, and number of sequences and amino acids
+ fname <- sub("microbial.nonredundant_protein", "nonredundant", sub(".protein.faa.gz", "", files[i]))
+ aa$ref <- paste(fname, "(", nseq, ",", naa, ")", sep="")
+ if(j==1) aa.out <- aa else aa.out <- rbind(aa.out, aa)
}
- if(is.null(out)) out <- aa.out else out <- rbind(out,aa.out)
+ if(is.null(out)) out <- aa.out else out <- rbind(out, aa.out)
}
- # now we have to combine taxids that showed up more than once
- # e.g. microbial10;microbial9(2231743)
+
+ # now we have to combine taxids that showed up more than once (in different files)
duptax <- unique(out$organism[duplicated(out$organism)])
if(length(duptax) > 0) {
for(i in 1:length(duptax)) {
id <- which(out$organism==duptax[i])
# add them together, weighted by numbers of sequences
- aa <- colSums(out$nseq[id]*out[id,6:25])/sum(out$nseq[id])
+ aa <- colSums(out$nseq[id]*out[id, 6:25]) / sum(out$nseq[id])
# keep the result in the first row
- out[id[1],6:25] <- aa
+ out[id[1], 6:25] <- aa
# total number of sequences found in the files
out$nseq[id[1]] <- sum(out$nseq[id])
# total number of amino acids found in the sequence files
- out$naa[id[1]] <- sum(out$naa[id])
- out$abbrv[id[1]] <- out$naa[id[1]]
+ out$abbrv[id[1]] <- sum(out$abbrv[id])
# write down the file names
- out$ref[id[1]] <- c2s(out$ref[id],sep=";")
- # drop the duplicated ones
- out <- out[-id[2:length(id)],]
+ out$ref[id[1]] <- c2s(out$ref[id], sep=";")
+ # drop the rows after the first
+ out <- out[-id[2:length(id)], ]
}
}
- # summarize the organism name and numbers of
- # sequences/amino acids read vs. the catalog
+ # append the organism name to the numbers of sequences and amino acids read
for(i in 1:nrow(out)) {
- dnseq <- out$nseq[i] - out$nseq.cat[i]
- dnaa <- out$naa[i] - out$naa.cat[i]
- if(dnseq != 0 | dnaa != 0) out$ref[i] <- paste("[",dnseq,",",dnaa,"]",out$ref[i],sep="")
- out$ref[i] <- paste(out$ref[i],out$orgname[i],sep="")
+ out$ref[i] <- paste(out$ref[i], out$orgname[i], sep="")
}
# we can drop the extra columns now
- out <- out[,1:25]
+ out <- out[, 1:25]
# and do a little rounding
- out[,6:25] <- round(out[,6:25],3)
- write.csv(out,"protein_refseq.csv",row.names=FALSE)
+ out[, 6:25] <- round(out[, 6:25], 3)
+ write.csv(out, "protein_refseq_complete.csv", row.names=FALSE)
return(out)
}
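The merging of duplicated taxids above is a weighted average over source files, with weights given by the number of sequences. A toy sketch with made-up numbers (not from the RefSeq data):

    nseq <- c(100, 300)          # sequences contributed by each file
    ala  <- c(8.2, 7.4)          # average alanine count per protein in each file
    sum(nseq * ala) / sum(nseq)  # 7.6, weighted toward the larger file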
Modified: pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz
===================================================================
(Binary files differ)
Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -2,11 +2,14 @@
# of species,genus,family,order,class,phylum,superkingdom
# for each of the microbial taxa in RefSeq database
+# uses functions in CHNOSZ to process taxonomy files
+require(CHNOSZ)
+
# change this to the location where names.dmp and nodes.dmp are located
taxdir <- "./taxdump"
# get the taxids from protein_refseq.csv
-pr <- read.csv("protein_refseq.csv.xz")
+pr <- read.csv("protein_refseq.csv")
taxid <- pr$organism
# read in the names and nodes
@@ -20,7 +23,7 @@
}
# what ranks we want to get
-ranks <- c("species","genus","family","order","class","phylum","superkingdom")
+ranks <- c("species", "genus", "family", "order", "class", "phylum", "superkingdom")
# start with an empty list
out <- rep(list(character()), length(ranks))
@@ -43,14 +46,14 @@
# get names of these parents
pnames <- sciname(pids[ip], taxdir, names=taxnames)
# add results to output list
- for(j in 1:length(ranks)) out[[j]] <- c(out[[j]],pnames[j])
+ for(j in 1:length(ranks)) out[[j]] <- c(out[[j]], pnames[j])
# report progress
- if(i %% 50 == 0) cat(paste(i,".. "))
+ if(i %% 50 == 0) cat(paste(i, ".. "))
}
# finish progress report
cat("done!\n")
# write results to a file
out <- as.data.frame(out)
-out <- cbind(data.frame(taxid=taxid[ii],out))
-write.csv(out,"taxid_names.csv",row.names=FALSE,quote=FALSE)
+out <- cbind(data.frame(taxid=taxid[ii], out))
+write.csv(out, "taxid_names.csv", row.names=FALSE, quote=FALSE)
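A minimal sketch of the CHNOSZ taxonomy helpers that taxid.names.R relies on, run in isolation; it assumes names.dmp from the NCBI taxdump sits in './taxdump' as in the script, and that getnames() is the CHNOSZ reader for that file:

    require(CHNOSZ)
    taxdir <- "./taxdump"                      # location of names.dmp (and nodes.dmp)
    taxnames <- getnames(taxdir)               # read names.dmp; slow for the full taxdump
    sciname(83333, taxdir, names = taxnames)   # scientific name of taxid 83333 (Escherichia coli K-12)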
Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz
===================================================================
(Binary files differ)
Modified: pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R
===================================================================
--- pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-10-05 07:17:54 UTC (rev 57)
@@ -1,66 +1,15 @@
-# trim the protein_refseq.csv, removing some entries with highly-represented names
+# trim the protein_refseq.csv,
+# keeping only organisms used in an example in ?protein.info
# (to keep the file size down for CHNOSZ package)
-# 20130320 jmd
-# the original file (7415 rows)
-pr <- read.csv("protein_refseq.csv")
-# the common names (comments show number in RefSeq 57
-names <- c(
- "Escherichia coli", # 662
- "Streptococcus", # 432
- "Salmonella", # 299
- "Staphylococcus", # 290
- "Enterococcus", # 218
- "Vibrio", # 190
- "Lactobacillus", # 179
- "Helicobacter", # 164
- "Pseudomonas", # 155
- "Mycobacterium", # 138
- "Campylobacter", # 131
- "Neisseria", # 121
- "Clostridium", # 118
- "Yersinia", # 111
- "Bacillus cereus", # 105
- "Acinetobacter", # 103
- "Propionibacterium", # 93
- "Burkholderia", # 91
- "Candidatus", # 83
- "Bacteroides", # 80
- "Mycoplasma", # 73
- "Streptomyces", # 72
- "Corynebacterium", # 70
- "Listeria", # 60
- "Leptospira", # 57
- "Klebsiella", # 57
- "Bifidobacterium", # 55
- "Brucella", # 53
- "Shigella", # 50
- "Haemophilus", # 47
- "Rickettsia", # 46
- "Prevotella", # 44
- "Chlamydia", # 42
- "Francisella", # 37
- "Bacillus thuringiensis", # 36
- "Borrelia", # 34
- "Fusobacterium", # 33
- "Xanthomonas", # 31
- "Rhizobium", # 27
- "Bartonella", # 26
- "Pseudoalteromonas", # 24
- "Bacillus anthracis", # 23
- "Actinomyces", # 23
- "Treponema", # 21
- "Actinobacillus", # 20
- "Aggregatibacter", # 20
- "Gardnerella" # 19
-)
-
-# loop over the names, identify rows, leave the first row
-for(name in names) {
- iname <- grep(name, pr$ref)
- iname <- tail(iname, -1)
- pr <- pr[-iname, ]
-}
-
-# we're left with 2600 rows
+# the original file
+pr <- read.csv("protein_refseq_complete.csv")
+# the terms we'll keep
+terms <- c("Natr", "Halo", "Rhodo", "Acido", "Methylo",
+ "Nitro", "Desulfo", "Chloro", "Geo", "Methano",
+ "Thermo", "Pyro", "Sulfo", "Buchner")
+# identify rows to keep
+iterm <- unique(unlist(sapply(terms, grep, pr$ref)))
+# extract data frame, write new file
+pr <- pr[iterm, ]
write.csv(pr, "protein_refseq.csv", row.names=FALSE)
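The term matching in trim_refseq.R keeps any row whose 'ref' field matches at least one term. A toy sketch with invented 'ref' strings:

    ref <- c("nonredundant1(10,5000)Halobacterium sp.",
             "nonredundant2(8,4000)Escherichia coli")
    terms <- c("Halo", "Methano")
    unique(unlist(sapply(terms, grep, ref)))   # 1: only the Halobacterium row is kept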
Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/extdata.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -57,17 +57,20 @@
}
- Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 57 of 2013-01-08:
+ Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 61 of 2013-09-09:
\itemize{
\item \code{README.txt} Instructions for producing the data files.
\item \code{gencat.sh} Bash script to extract microbial protein records from the RefSeq catalog.
- \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome.
+ \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome (based on RefSeq 57, 2013-01-08).
\item \code{mkfaa.sh} Combine the contents of .faa.gz files into a single FASTA file (to use e.g. for making a BLAST database).
\item \code{protein.refseq.R} Calculate average amino acid composition of all proteins for each organism identified by a taxonomic ID.
- \item \code{trim_refseq.R} Remove some entries with commonly occurring names (e.g. many different strains of Escherichia coli) to reduce size of \code{protein_refseq.csv} (to keep package size down).
- \item \code{protein_refseq.csv.xz} Output from above. See example for \code{\link{ZC}}.
+ \item \code{trim_refseq.R} Keep only selected organism names (reduces number of taxa from 6758 to 779, helps to control package size).
+ \item \code{protein_refseq.csv.xz} Output from above. See example in \code{\link{protein.info}}.
\item \code{taxid.names.R} Generate a table of scientific names for the provided taxids. Requires the complete \code{names.dmp} and \code{nodes.dmp} from NCBI taxonomy files.
- \item \code{taxid_names.csv.xz} Output from above. See example for \code{\link{id.blast}}.
+ \item \code{taxid_names.csv.xz} Output from above.
+ NOTE: For backward compatibility with the example BLAST files for the Bison Pool metagenome, the packaged file merges records for taxids found in either RefSeq 57 or 61.
+ Certain taxids in release 57 were not located in the current RefSeq catalog, probably related to the transition to the \dQuote{WP} multispecies accessions (\url{ftp://ftp.ncbi.nlm.nih.gov/refseq/release/announcements/WP-proteins-06.10.2013.pdf}).
+ See example for \code{\link{id.blast}}.
}
Files in \code{taxonomy} contain example taxonomic data files:
Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/util.fasta.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -14,7 +14,7 @@
grep.file(file, pattern = "", y = NULL, ignore.case = TRUE,
startswith = ">", lines = NULL, grep = "grep")
read.fasta(file, i = NULL, ret = "count", lines = NULL,
- ihead = NULL, start=NULL, stop=NULL, type="protein")
+ ihead = NULL, start=NULL, stop=NULL, type="protein", id = NULL)
count.aa(seq, start=NULL, stop=NULL, type="protein")
uniprot.aa(protein, start=NULL, stop=NULL)
}
@@ -26,13 +26,14 @@
\item{ignore.case}{logical, ignore differences between upper- and lower-case?}
\item{startswith}{character, only lines starting with this expression are matched}
\item{lines}{list of character, supply the lines here instead of reading them from file}
- \item{grep}{character, name of system \code{grep} command}
+ \item{grep}{character, name of system \samp{grep} command}
\item{i}{numeric, line numbers of sequence headers to read}
\item{ret}{character, specification for type of return (count, sequence, or FASTA format)}
\item{ihead}{numeric, which lines are headers}
\item{start}{numeric, position in sequence to start counting}
\item{stop}{numeric, position in sequence to stop counting}
\item{type}{character, sequence type (protein or DNA)}
+ \item{id}{character, value to be used for \code{protein} in output table}
\item{seq}{character, amino acid sequence of a protein}
\item{protein}{character, entry name for protein in UniProt}
}
@@ -48,10 +49,12 @@
\code{read.fasta} is used to retrieve entries from a FASTA file.
To read only selected sequences pass the line numbers of the header lines to the function in \code{i} (they can be identified using e.g. \code{grep.file}).
-The function returns various formats depending on the value of \code{ret}; the default \samp{count} returns a dataframe of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
+The function returns various formats depending on the value of \code{ret}.
+The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
Similarly to \code{grep.file}, this function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines.
If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}.
Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
+When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
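A minimal usage sketch of the new 'id' argument described above; "some.faa" is a placeholder file name:

    require(CHNOSZ)
    aa1 <- read.fasta("some.faa")                 # 'protein' column parsed from the ">" headers
    aa2 <- read.fasta("some.faa", id = "myseqs")  # 'protein' column set to "myseqs"; skips header parsing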
Modified: pkg/CHNOSZ/man/util.formula.Rd
===================================================================
--- pkg/CHNOSZ/man/util.formula.Rd 2013-09-22 10:22:25 UTC (rev 56)
+++ pkg/CHNOSZ/man/util.formula.Rd 2013-10-05 07:17:54 UTC (rev 57)
@@ -60,7 +60,9 @@
}
\seealso{
- \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument). \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}}
+ \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument).
+ \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}}.
+ \code{\link{protein.formula}} has an example of computing ZC for proteins compiled from the RefSeq database.
}
\examples{\dontshow{data(thermo)}
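The cross-reference added to util.formula.Rd points at a ZC calculation for the RefSeq average proteins. A sketch of what such a calculation can look like, assuming protein.formula() accepts the amino acid composition data frame read from the packaged file:

    require(CHNOSZ)
    file <- system.file("extdata/refseq/protein_refseq.csv.xz", package = "CHNOSZ")
    aa <- read.csv(file)            # read.csv decompresses the .xz file transparently
    zc <- ZC(protein.formula(aa))   # average oxidation state of carbon, one value per taxon
    summary(zc)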