From noreply at r-forge.r-project.org Sat Oct 5 09:17:54 2013 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 5 Oct 2013 09:17:54 +0200 (CEST) Subject: [CHNOSZ-commits] r57 - in pkg/CHNOSZ: . R inst inst/extdata/refseq man Message-ID: <20131005071754.9F80A18426B@r-forge.r-project.org> Author: jedick Date: 2013-10-05 09:17:54 +0200 (Sat, 05 Oct 2013) New Revision: 57 Modified: pkg/CHNOSZ/DESCRIPTION pkg/CHNOSZ/R/util.fasta.R pkg/CHNOSZ/inst/NEWS pkg/CHNOSZ/inst/extdata/refseq/README.txt pkg/CHNOSZ/inst/extdata/refseq/gencat.sh pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R pkg/CHNOSZ/man/extdata.Rd pkg/CHNOSZ/man/util.fasta.Rd pkg/CHNOSZ/man/util.formula.Rd Log: update extdata/refseq to RefSeq 61 Modified: pkg/CHNOSZ/DESCRIPTION =================================================================== --- pkg/CHNOSZ/DESCRIPTION 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/DESCRIPTION 2013-10-05 07:17:54 UTC (rev 57) @@ -1,6 +1,6 @@ -Date: 2013-09-22 +Date: 2013-10-05 Package: CHNOSZ -Version: 1.0.1-1 +Version: 1.0.1-2 Title: Chemical Thermodynamics and Activity Diagrams Author: Jeffrey M. Dick Maintainer: Jeffrey Dick Modified: pkg/CHNOSZ/R/util.fasta.R =================================================================== --- pkg/CHNOSZ/R/util.fasta.R 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/R/util.fasta.R 2013-10-05 07:17:54 UTC (rev 57) @@ -45,16 +45,18 @@ } read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL, - start=NULL, stop=NULL, type="protein") { + start=NULL, stop=NULL, type="protein", id=NULL) { # read sequences from a fasta file # some of the following code was adapted from # read.fasta in package seqinR # value of 'i' is what sequences to read # value of 'ret' determines format of return value: - # count: amino acid composition (same columns as thermo$protein, can be used by add.protein) + # count: amino acid composition (same columns as thermo$protein, can be used by add.protein) # or nucleic acid base composition (A-C-G-T) - # seq: amino acid sequence - # fas: fasta entry + # seq: amino acid sequence + # fas: fasta entry + # value of 'id' is used for 'protein' in output table, + # otherwise ID is parsed from FASTA header (can take a while) is.nix <- Sys.info()[[1]]=="Linux" if(is.nix & is.null(lines)) { msgout("read.fasta: reading ",basename(file),"\n") @@ -106,7 +108,7 @@ organism <- bnf # protein/gene name is from header line for entry # (strip the ">" and go to the first space or underscore) - id <- as.character(palply(1:length(i), function(j) { + if(is.null(id)) id <- as.character(palply(1:length(i), function(j) { # get the text of the line f1 <- linefun(i[j],i[j]) # stop if the first character is not ">" Modified: pkg/CHNOSZ/inst/NEWS =================================================================== --- pkg/CHNOSZ/inst/NEWS 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/NEWS 2013-10-05 07:17:54 UTC (rev 57) @@ -1,4 +1,4 @@ -CHANGES IN CHNOSZ 1.0.1-1 (2013-09-22) +CHANGES IN CHNOSZ 1.0.1-2 (2013-10-05) -------------------------------------- - Updated extdata/protein/Sce.csv.xz using Saccharomyces Genome Database @@ -6,6 +6,14 @@ - more.aa() includes SGDID and gene name in the abbrv and ref columns. +- In extdata/refseq, scripts and data files were updated for NCBI + Reference Sequence (RefSeq) release 61 (2013-09-09). Code was adapted + to deal with WP multispecies accessions. + +- read.fasta() gets new argument 'id'; when supplied, it skips reading + the protein names from the FASTA headers. + + CHANGES IN CHNOSZ 1.0.1 (2013-07-04) ------------------------------------ Modified: pkg/CHNOSZ/inst/extdata/refseq/README.txt =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/README.txt 2013-10-05 07:17:54 UTC (rev 57) @@ -1,50 +1,47 @@ # the following data files support calculations using the -# RefSeq database (release 57, 2013-01-08) +# RefSeq database (release 61, 2013-09-09) protein_refseq.csv: overall (average) amino acid composition of all proteins for each - microbial genome in the RefSeq collection (n=7415) -taxid_names.csv: taxid, phylum name and species name for 7415 microbial taxa + microbial genome in the RefSeq collection (n=6758) +taxid_names.csv: taxid, phylum name and species name for 6758 microbial taxa # these functions/scripts have the following purpose (output files listed in parentheses): -mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq57.faa) gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt) protein.refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv) taxid.names.R - get taxonomic names for each taxid represented (taxid_names.csv) +mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq61.faa) # bash scripts assume a GNU/Linux-like operating system -# timings were made for processing RefSeq 57 on a recent (2009) intel laptop +# timings were made for processing RefSeq 61 on a recent (2009) intel laptop # download stuff -1. download 'release57.files.installed' and 'RefSeq-release57.catalog.gz' from NCBI +1. download 'release61.files.installed', 'RefSeq-release61.catalog.gz', + and 'release61.multispecies_WP_accession_to_taxname.gz' from NCBI (ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog) 2. list URLS for the microbial protein sequence files: - grep microbial.*.protein.faa* release57.files.installed | \ + grep microbial.*.protein.faa* release61.files.installed | \ sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist -3. download the files using 'wget -i urllist' [3227 files, 4.6 GB] +3. download the files using 'wget -i urllist' [232 files, 5.5 GB] 4. move the .gz files to a directory named 'protein' # protein stuff -5. gzip -d RefSeq-release57.catalog.gz [3.1 GB] -6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [7 minutes] - for RefSeq57, 'cat gi.taxid.txt | wc -l' is 24488527 -7. generate protein_refseq.csv in R: [~19 hours] +5. gzip -d RefSeq-release61.catalog.gz [3.5 GB] +6. use 'gencat.sh' to generate gi.taxid.txt for microbial proteins in the catalog [3 minutes] + for RefSeq61, 'cat gi.taxid.txt | wc -l' is 28548891 +7. generate protein_refseq_complete.csv in R: [~14 hours] > source("protein.refseq.R") > protein.refseq() note that this depends on gi.taxid.txt and the .faa.gz files in the 'protein' directory -8. trim entries from protein_refseq.csv (smaller size, better for package distribution) +8. trim entries to produce protein_refseq.csv (smaller size, better for package distribution) > source("trim_refseq.R") # taxonomy stuff 9. edit 'taxid.names.R' so that 'taxdir' points to the directory where the files 'names.dmp' and 'nodes.dmp' are present. these files can be downloaded from - ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-01-15) -10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~4.5 hours] + ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz (accessed on 2013-09-18) +10. source 'taxid.names.R' to generate the file 'taxid_names.csv' [~5.5 hours] # BLAST stuff (optional) 11. run ls protein/*.gz > filelist -12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq57.faa' [9.3 GB, 11 minutes] - for RefSeq57, 'grep "^>" refseq57.faa | wc -l' is 24477649 - the difference from the catalog (step 6 above), is 10878 sequences: - taxid 1211777 (Rhizobium mesoamericanum STM3625) (6356 sequences) - taxid 313627 (Bacillus sp. NRRL B-14911) (4522 sequences) -13. make a BLAST database, e.g. formatdb -t refseq57 -i refseq57.faa -p T - +12. use 'mkfaa.sh' to combine the sequences into a single file 'refseq61.faa' [11 GB, 11 minutes] + for RefSeq61, 'grep "^>" refseq61.faa | wc -l' is 28548891 (same as catalog) +13. make a BLAST database, e.g. formatdb -t refseq61 -i refseq61.faa -p T Modified: pkg/CHNOSZ/inst/extdata/refseq/gencat.sh =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/gencat.sh 2013-10-05 07:17:54 UTC (rev 57) @@ -1,27 +1,19 @@ #/bin/sh # extract microbial, genomic records from the RefSeq catalog -RELEASE=57 -ORG=microbial -MOL=protein -BASENAME=RefSeq-release$RELEASE.catalog +RELEASE=61 +INFILE=RefSeq-release$RELEASE.catalog +OUTFILE=RefSeq-release$RELEASE.catalog.microbial.protein -# extract the microbial records -grep \|$ORG $BASENAME > $BASENAME.$ORG +# extract the microbial protein records +grep "[[:space:]]NP_.*microbial" $INFILE > $OUTFILE # 449849 records +grep "[[:space:]]YP_.*microbial" $INFILE >> $OUTFILE # 7898382 records +grep "[[:space:]]WP_.*microbial" $INFILE >> $OUTFILE # 20200660 records -# extract the protein records -# alternatively, could use egrep: -#egrep "[[:space:]]AP_ | [[:space:]]NP_ | [[:space:]]XP_ | \ -# [[:space:]]YP_ | [[:space:]]ZP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL -grep "[[:space:]]AP_" $BASENAME.$ORG > $BASENAME.$ORG.$MOL # 0 records -grep "[[:space:]]NP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 450218 records -grep "[[:space:]]XP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 0 records -grep "[[:space:]]YP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 6786156 records -grep "[[:space:]]ZP_" $BASENAME.$ORG >> $BASENAME.$ORG.$MOL # 17252153 records - -# to save only the gi, taxid and sequence length columns, in that order +# to save only the gi and taxid columns # the field separator (tab) is defined in command line, not in awk program, # otherwise the first line gets processed incorrectly -cat $BASENAME.$ORG.$MOL | awk -F\\t '{print $4,$1,$7}' > gi.taxid.unsrt +cat $OUTFILE | awk -F\\t '{print $4,$1}' > gi.taxid.unsrt # sort the file on gi so that it can be used with e.g. the unix 'join' command +# (note: for join to work, -n for numeric sort is *not* used here) cat gi.taxid.unsrt | sort > gi.taxid.txt Modified: pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/mkfaa.sh 2013-10-05 07:17:54 UTC (rev 57) @@ -1,6 +1,6 @@ # send the contents of all the .faa.gz files to a single file ("OUTFILE") -OUTFILE="refseq57.faa" +OUTFILE="refseq61.faa" FILELIST="filelist" # start with an empty file Modified: pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/protein.refseq.R 2013-10-05 07:17:54 UTC (rev 57) @@ -1,129 +1,126 @@ # protein.refseq.R -# calculate the average amino acid -# composition of proteins for each taxid -# 20100704 jmd +# calculate the overall amino acid composition of proteins for each taxid in RefSeq +# 20100704 first version +# 20130922 deal with WP multispecies accessions (RefSeq61) -# we need CHNOSZ for read.fasta +# uses system "join" command +# also, CHNOSZ for read.fasta, def2gi, etc. require(CHNOSZ) # where the microbial*.protein.faa.gz files are kept faadir <- "protein" -# read gi.taxid if it already doesn't exist -if(!exists("gi.taxid")) { - cat("reading gi.taxid.txt ... could take a while!\n") - gi.taxid <- read.table("gi.taxid.txt") - # we should make sure that all 3 columns are numeric (not factors) +# read WP multispecies table if it doesn't already exist +if(!exists("WP")) { + cat("reading WP multispecies table\n") + WP <- read.table(dir(pattern="multispecies_WP_accession_to_taxname.txt.gz"), sep="\t") } protein.refseq <- function(n=NULL) { # the list of sequence files (microbial44.protein.fa.gz etc) files <- dir(faadir) # loop over each one, getting the contents - if(is.null(n)) n <- length(files) + if(is.null(n)) n <- 1:length(files) out <- NULL - for(i in 1:n) { - cat(paste("(",i,"of",n,") ")) - # check if it's a fasta file - if(length(grep(".protein.faa.gz$",files[i]))==0) { - cat(paste("skipping \'",files[i],"\' ...\n")) + for(i in n) { + cat(paste("(", i, " of ", length(n), ") ", sep="")) + # check if the name looks like a fasta file + if(length(grep(".protein.faa.gz$", files[i]))==0) { + cat(paste("skipping", files[i], "...\n")) next } else { - cat(paste("reading \'",files[i],"\' ... ")) + cat(paste("reading", files[i], "... ")) } - # read the fasta file - fa <- readLines(paste(faadir,files[i],sep="/")) - # get the gi's for this file - ihead.orig <- ihead <- grep("^>",fa) - cat(paste("file has",length(ihead),"sequences\n")) + # read the file + fa <- readLines(paste(faadir, files[i], sep="/")) + # get the gi's from this file + ihead <- grep("^>", fa) + cat(paste("file has", length(ihead), "sequences\n")) + # leave the gi's as character to sort lexigraphically (for use with system join command) gi <- def2gi(fa[ihead]) # now find the taxids that belong to these gi's cat("finding matching taxids in catalog ...") - gi.order <- order(as.character(gi)) - gi <- gi[gi.order] - ihead <- ihead[gi.order] - write.table(gi,"gi.txt",row.names=FALSE,col.names=FALSE,quote=FALSE) - system("join -1 1 -2 1 gi.txt gi.taxid.txt > gi.taxid.match") - gi.taxid.match <- read.table("gi.taxid.match",header=FALSE) - tax <- gi.taxid.match$V2 - utax <- unique(tax) - cat(paste(" found",length(utax),"unique taxa\n")) + write.table(sort(gi), "gi.txt", row.names=FALSE, col.names=FALSE, quote=FALSE) + system("join -j 1 gi.txt gi.taxid.txt > gi.taxid.match") + gi.taxid <- read.table("gi.taxid.match", header=FALSE) + # produce an expanded table, replacing multispecies WP_ records with all corresponding taxids + is.multi <- gi.taxid$V1 %in% WP$V2 + is.in.gi <- WP$V2 %in% gi.taxid$V1 + # take out generic records + gi.taxid <- gi.taxid[!is.multi, ] + # add specific records + gi.taxid.WP <- WP[is.in.gi, ] + colnames(gi.taxid.WP) <- c("V0", "V1", "V2", "V3") + gi.taxid <- rbind(gi.taxid, gi.taxid.WP[, c(2, 3)]) + # identify unique taxids + utax <- unique(gi.taxid$V2) + cat(paste(" found", length(utax), "unique taxa\n")) # now loop over each taxon for(j in 1:length(utax)) { - jtax <- which(tax==utax[j]) - nseq.read <- length(jtax) - cat(paste("taxon",utax[j],"has",nseq.read,"sequences")) - # count the number of sequences in the catalog - jjtax <- which(gi.taxid$V2==utax[j]) - nseq.cat <- length(jjtax) - if(nseq.cat==nseq.read) cat(" (same as catalog)") - else cat(paste(" WARNING: catlog has",nseq.cat)) + # report on the taxon name and number of sequences + jtax <- which(gi.taxid$V2==utax[j]) + nseq <- length(jtax) + cat(paste("taxon", utax[j], "has", nseq, "sequences ")) + # match the gi numbers to their position in the fasta file + igi <- match(gi.taxid$V1[jtax], as.numeric(gi)) # get the amino acid compositions of the sequences - aa <- read.fasta(file="",i=ihead[jtax],lines=fa,ihead=ihead.orig) - naa.read <- sum(aa[,6:25]) - # count the number of amino acids in the catalog - naa.cat <- sum(gi.taxid$V3[jjtax]) - if(naa.cat==naa.read) cat(paste("... all of",naa.cat,"amino acids ")) - else cat(paste("...",naa.read,"of",naa.cat,"amino acids ")) + # use suppressMessages to hide mesages about unrecognized amino acid codes + # set id="refseq" to avoid parsing IDs from fasta header lines + aa <- suppressMessages(read.fasta(file="", i=ihead[igi], lines=fa, ihead=ihead, id="refseq")) + # the number of amino acids read from file + naa <- sum(aa[, 6:25]) # prepare the dataframe # average amino acid composition - aa[1,6:25] <- colSums(aa[,6:25])/nrow(aa) - aa <- aa[1,] - aa$protein <- "refseq" + aa[1, 6:25] <- colSums(aa[, 6:25])/nrow(aa) + aa <- aa[1, ] aa$organism <- utax[j] aa$chains <- 1 - # the number of amino acids read from files - aa$abbrv <- naa.read - # what is the name of this organism - itax1 <- match(utax[j],gi.taxid.match$V2) - gi1 <- gi.taxid.match$V1[itax1] - igi1 <- match(gi1,gi) - iihead <- ihead[igi1] - orgname <- s2c(fa[iihead],sep=" [",keep.sep=TRUE)[2] - orgname <- substr(orgname,2,nchar(orgname)) - cat(paste(orgname,"\n")) - # numbers of sequences, amino acids, organism name - aa <- cbind(aa,data.frame(nseq=nseq.read,naa=naa.read,nseq.cat=nseq.cat,naa.cat=naa.cat,orgname=orgname)) - # keep track of source file name - # (append number of sequences according to the catalog) - aa$ref <- paste(sub(".protein.faa.gz","",files[i]),"(",nseq.read,",",naa.read,")",sep="") - if(j==1) aa.out <- aa else aa.out <- rbind(aa.out,aa) + aa$abbrv <- naa + # what is the name of this organism: look first at multispecies WP, then in gi header + jtax.WP <- match(utax[j], gi.taxid.WP$V2) + if(!is.na(jtax.WP)) orgname <- paste("[", gi.taxid.WP$V3[jtax.WP], "]", sep="") + else { + orgname <- s2c(fa[ihead[igi[1]]], sep=" [", keep.sep=TRUE)[2] + orgname <- substr(orgname, 2, nchar(orgname)) + } + cat(paste(orgname, "\n")) + # append columns for numbers of sequences, organism name + aa <- cbind(aa, data.frame(nseq=nseq, orgname=orgname)) + # keep track of source file name, and number of sequences and amino acids + fname <- sub("microbial.nonredundant_protein", "nonredundant", sub(".protein.faa.gz", "", files[i])) + aa$ref <- paste(fname, "(", nseq, ",", naa, ")", sep="") + if(j==1) aa.out <- aa else aa.out <- rbind(aa.out, aa) } - if(is.null(out)) out <- aa.out else out <- rbind(out,aa.out) + if(is.null(out)) out <- aa.out else out <- rbind(out, aa.out) } - # now we have to combine taxids that showed up more than once - # e.g. microbial10;microbial9(2231743) + + # now we have to combine taxids that showed up more than once (in different files) duptax <- unique(out$organism[duplicated(out$organism)]) if(length(duptax) > 0) { for(i in 1:length(duptax)) { id <- which(out$organism==duptax[i]) # add them together, weighted by numbers of sequences - aa <- colSums(out$nseq[id]*out[id,6:25])/sum(out$nseq[id]) + aa <- colSums(out$nseq[id]*out[id, 6:25]) / sum(out$nseq[id]) # keep the result in the first row - out[id[1],6:25] <- aa + out[id[1], 6:25] <- aa # total number of sequences found in the files out$nseq[id[1]] <- sum(out$nseq[id]) # total number of amino acids found in the sequence files - out$naa[id[1]] <- sum(out$naa[id]) - out$abbrv[id[1]] <- out$naa[id[1]] + out$abbrv[id[1]] <- sum(out$abbrv[id]) # write down the file names - out$ref[id[1]] <- c2s(out$ref[id],sep=";") - # drop the duplicated ones - out <- out[-id[2:length(id)],] + out$ref[id[1]] <- c2s(out$ref[id], sep=";") + # drop the rows after the first + out <- out[-id[2:length(id)], ] } } - # summarize the organism name and numbers of - # sequences/amino acids read vs. the catalog + # append the organism name to the numbers of sequences and amino acids read for(i in 1:nrow(out)) { - dnseq <- out$nseq[i] - out$nseq.cat[i] - dnaa <- out$naa[i] - out$naa.cat[i] - if(dnseq != 0 | dnaa != 0) out$ref[i] <- paste("[",dnseq,",",dnaa,"]",out$ref[i],sep="") - out$ref[i] <- paste(out$ref[i],out$orgname[i],sep="") + out$ref[i] <- paste(out$ref[i], out$orgname[i], sep="") } # we can drop the extra columns now - out <- out[,1:25] + out <- out[, 1:25] # and do a little rounding - out[,6:25] <- round(out[,6:25],3) - write.csv(out,"protein_refseq.csv",row.names=FALSE) + out[, 6:25] <- round(out[, 6:25], 3) + write.csv(out, "protein_refseq_complete.csv", row.names=FALSE) return(out) } Modified: pkg/CHNOSZ/inst/extdata/refseq/protein_refseq.csv.xz =================================================================== (Binary files differ) Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/taxid.names.R 2013-10-05 07:17:54 UTC (rev 57) @@ -2,11 +2,14 @@ # of species,genus,family,order,class,phylum,superkingdom # for each of the microbial taxa in RefSeq database +# uses functions in CHNOSZ to process taxonomy files +require(CHNOSZ) + # change this to the location where names.dmp and nodes.dmp are located taxdir <- "./taxdump" # get the taxids from protein_refseq.csv -pr <- read.csv("protein_refseq.csv.xz") +pr <- read.csv("protein_refseq.csv") taxid <- pr$organism # read in the names and nodes @@ -20,7 +23,7 @@ } # what ranks we want to get -ranks <- c("species","genus","family","order","class","phylum","superkingdom") +ranks <- c("species", "genus", "family", "order", "class", "phylum", "superkingdom") # start with an empty list out <- rep(list(character()), length(ranks)) @@ -43,14 +46,14 @@ # get names of these parents pnames <- sciname(pids[ip], taxdir, names=taxnames) # add results to output list - for(j in 1:length(ranks)) out[[j]] <- c(out[[j]],pnames[j]) + for(j in 1:length(ranks)) out[[j]] <- c(out[[j]], pnames[j]) # report progress - if(i %% 50 == 0) cat(paste(i,".. ")) + if(i %% 50 == 0) cat(paste(i, ".. ")) } # finish progress report cat("done!\n") # write results to a file out <- as.data.frame(out) -out <- cbind(data.frame(taxid=taxid[ii],out)) -write.csv(out,"taxid_names.csv",row.names=FALSE,quote=FALSE) +out <- cbind(data.frame(taxid=taxid[ii], out)) +write.csv(out, "taxid_names.csv", row.names=FALSE, quote=FALSE) Modified: pkg/CHNOSZ/inst/extdata/refseq/taxid_names.csv.xz =================================================================== (Binary files differ) Modified: pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R =================================================================== --- pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/inst/extdata/refseq/trim_refseq.R 2013-10-05 07:17:54 UTC (rev 57) @@ -1,66 +1,15 @@ -# trim the protein_refseq.csv, removing some entries with highly-represented names +# trim the protein_refseq.csv, +# keeping only organisms used in an example in ?protein.info # (to keep the file size down for CHNOSZ package) -# 20130320 jmd -# the original file (7415 rows) -pr <- read.csv("protein_refseq.csv") -# the common names (comments show number in RefSeq 57 -names <- c( - "Escherichia coli", # 662 - "Streptococcus", # 432 - "Salmonella", # 299 - "Staphylococcus", # 290 - "Enterococcus", # 218 - "Vibrio", # 190 - "Lactobacillus", # 179 - "Helicobacter", # 164 - "Pseudomonas", # 155 - "Mycobacterium", # 138 - "Campylobacter", # 131 - "Neisseria", # 121 - "Clostridium", # 118 - "Yersinia", # 111 - "Bacillus cereus", # 105 - "Acinetobacter", # 103 - "Propionibacterium", # 93 - "Burkholderia", # 91 - "Candidatus", # 83 - "Bacteroides", # 80 - "Mycoplasma", # 73 - "Streptomyces", # 72 - "Corynebacterium", # 70 - "Listeria", # 60 - "Leptospira", # 57 - "Klebsiella", # 57 - "Bifidobacterium", # 55 - "Brucella", # 53 - "Shigella", # 50 - "Haemophilus", # 47 - "Rickettsia", # 46 - "Prevotella", # 44 - "Chlamydia", # 42 - "Francisella", # 37 - "Bacillus thuringiensis", # 36 - "Borrelia", # 34 - "Fusobacterium", # 33 - "Xanthomonas", # 31 - "Rhizobium", # 27 - "Bartonella", # 26 - "Pseudoalteromonas", # 24 - "Bacillus anthracis", # 23 - "Actinomyces", # 23 - "Treponema", # 21 - "Actinobacillus", # 20 - "Aggregatibacter", # 20 - "Gardnerella" # 19 -) - -# loop over the names, identify rows, leave the first row -for(name in names) { - iname <- grep(name, pr$ref) - iname <- tail(iname, -1) - pr <- pr[-iname, ] -} - -# we're left with 2600 rows +# the original file +pr <- read.csv("protein_refseq_complete.csv") +# the terms we'll keep +terms <- c("Natr", "Halo", "Rhodo", "Acido", "Methylo", + "Nitro", "Desulfo", "Chloro", "Geo", "Methano", + "Thermo", "Pyro", "Sulfo", "Buchner") +# identify rows to keep +iterm <- unique(unlist(sapply(terms, grep, pr$ref))) +# extract data frame, write new file +pr <- pr[iterm, ] write.csv(pr, "protein_refseq.csv", row.names=FALSE) Modified: pkg/CHNOSZ/man/extdata.Rd =================================================================== --- pkg/CHNOSZ/man/extdata.Rd 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/man/extdata.Rd 2013-10-05 07:17:54 UTC (rev 57) @@ -57,17 +57,20 @@ } - Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 57 of 2013-01-08: + Files in \code{refseq} contain code and results of processing NCBI Reference Sequences (RefSeq) for microbial proteins, using RefSeq release 61 of 2013-09-09: \itemize{ \item \code{README.txt} Instructions for producing the data files. \item \code{gencat.sh} Bash script to extract microbial protein records from the RefSeq catalog. - \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome. + \item \code{gi.taxid.txt} Output from above. The complete file is too large to distribute with CHNOSZ, but a portion is included in \code{extdata/bison} to support processing example BLAST files for the Bison Pool metagenome (based on RefSeq 57, 2013-01-08). \item \code{mkfaa.sh} Combine the contents of .faa.gz files into a single FASTA file (to use e.g. for making a BLAST database). \item \code{protein.refseq.R} Calculate average amino acid composition of all proteins for each organism identified by a taxonomic ID. - \item \code{trim_refseq.R} Remove some entries with commonly occurring names (e.g. many different strains of Escherichia coli) to reduce size of \code{protein_refseq.csv} (to keep package size down). - \item \code{protein_refseq.csv.xz} Output from above. See example for \code{\link{ZC}}. + \item \code{trim_refseq.R} Keep only selected organism names (reduces number of taxa from 6758 to 779, helps to control package size). + \item \code{protein_refseq.csv.xz} Output from above. See example in \code{\link{protein.info}}. \item \code{taxid.names.R} Generate a table of scientific names for the provided taxids. Requires the complete \code{names.dmp} and \code{nodes.dmp} from NCBI taxonomy files. - \item \code{taxid_names.csv.xz} Output from above. See example for \code{\link{id.blast}}. + \item \code{taxid_names.csv.xz} Output from above. + NOTE: For backward compatibility with the example BLAST files for the Bison Pool metagenome, the packaged file merges records for taxids found in either RefSeq 57 or 61. + Certain taxids in release 57 were not located in the current RefSeq catalog, probably related to the transition to the \dQuote{WP} multispecies accessions (\url{ftp://ftp.ncbi.nlm.nih.gov/refseq/release/announcements/WP-proteins-06.10.2013.pdf}). + See example for \code{\link{id.blast}}. } Files in \code{taxonomy} contain example taxonomic data files: Modified: pkg/CHNOSZ/man/util.fasta.Rd =================================================================== --- pkg/CHNOSZ/man/util.fasta.Rd 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/man/util.fasta.Rd 2013-10-05 07:17:54 UTC (rev 57) @@ -14,7 +14,7 @@ grep.file(file, pattern = "", y = NULL, ignore.case = TRUE, startswith = ">", lines = NULL, grep = "grep") read.fasta(file, i = NULL, ret = "count", lines = NULL, - ihead = NULL, start=NULL, stop=NULL, type="protein") + ihead = NULL, start=NULL, stop=NULL, type="protein", id = NULL) count.aa(seq, start=NULL, stop=NULL, type="protein") uniprot.aa(protein, start=NULL, stop=NULL) } @@ -26,13 +26,14 @@ \item{ignore.case}{logical, ignore differences between upper- and lower-case?} \item{startswith}{character, only lines starting with this expression are matched} \item{lines}{list of character, supply the lines here instead of reading them from file} - \item{grep}{character, name of system \code{grep} command} + \item{grep}{character, name of system \samp{grep} command} \item{i}{numeric, line numbers of sequence headers to read} \item{ret}{character, specification for type of return (count, sequence, or FASTA format)} \item{ihead}{numeric, which lines are headers} \item{start}{numeric, position in sequence to start counting} \item{stop}{numeric, position in sequence to stop counting} \item{type}{character, sequence type (protein or DNA)} + \item{id}{character, value to be used for \code{protein} in output table} \item{seq}{character, amino acid sequence of a protein} \item{protein}{character, entry name for protein in UniProt} } @@ -48,10 +49,12 @@ \code{read.fasta} is used to retrieve entries from a FASTA file. To read only selected sequences pass the line numbers of the header lines to the function in \code{i} (they can be identified using e.g. \code{grep.file}). -The function returns various formats depending on the value of \code{ret}; the default \samp{count} returns a dataframe of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences). +The function returns various formats depending on the value of \code{ret}. +The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences). Similarly to \code{grep.file}, this function utilizes the OS's \samp{grep} on supported operating systems in order to identify the header lines as well as \samp{cat} to read the file, otherwise \code{\link{readLines}} and \R's \code{\link{substr}} are used to read the file and locate the header lines. If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to ""). +When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided. \code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}). For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}. Modified: pkg/CHNOSZ/man/util.formula.Rd =================================================================== --- pkg/CHNOSZ/man/util.formula.Rd 2013-09-22 10:22:25 UTC (rev 56) +++ pkg/CHNOSZ/man/util.formula.Rd 2013-10-05 07:17:54 UTC (rev 57) @@ -60,7 +60,9 @@ } \seealso{ - \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument). \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}} + \code{\link{makeup}}, used by \code{mass} and \code{entropy}, and \code{ZC} and \code{i2A} for counting the elements in a formula (the latter two make use of the \code{count.zero} argument). + \code{\link{group.formulas}} (and by extension \code{\link{protein.formula}}) use the stoichiometric matrices created by \code{i2A}, as does \code{\link{run.wjd}}. + \code{\link{protein.formula}} has an example of computing ZC for proteins compiled from the RefSeq database. } \examples{\dontshow{data(thermo)}