[CHNOSZ-commits] r699 - in pkg/CHNOSZ: . R inst inst/extdata man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Mon Feb 7 05:38:30 CET 2022


Author: jedick
Date: 2022-02-07 05:38:29 +0100 (Mon, 07 Feb 2022)
New Revision: 699

Removed:
   pkg/CHNOSZ/R/util.blast.R
   pkg/CHNOSZ/inst/extdata/bison/
   pkg/CHNOSZ/man/util.blast.Rd
Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/NAMESPACE
   pkg/CHNOSZ/R/examples.R
   pkg/CHNOSZ/inst/NEWS.Rd
   pkg/CHNOSZ/inst/TODO
   pkg/CHNOSZ/man/CHNOSZ-package.Rd
   pkg/CHNOSZ/man/extdata.Rd
Log:
Remove functions in util.blast.R and example files in extdata/bison


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/DESCRIPTION	2022-02-07 04:38:29 UTC (rev 699)
@@ -1,6 +1,6 @@
 Date: 2022-02-07
 Package: CHNOSZ
-Version: 1.4.1-24
+Version: 1.4.1-25
 Title: Thermodynamic Calculations and Diagrams for Geochemistry
 Authors at R: c(
     person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),

Modified: pkg/CHNOSZ/NAMESPACE
===================================================================
--- pkg/CHNOSZ/NAMESPACE	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/NAMESPACE	2022-02-07 04:38:29 UTC (rev 699)
@@ -15,7 +15,6 @@
 # examples
   "examples", "demos", "mtitle",
   "list2array", "slice", "dimSums", "slice.affinity",
-  "def2gi", "read.blast", "id.blast",
   "add.OBIGT", "RH2OBIGT",
   "expr.property", "expr.units",
   "mass", "entropy", "GHS", "water",
@@ -41,7 +40,7 @@
   "count.elements",
 # (no other functions are used in the tests)
 # other exported functions that are not used above
-  "write.blast", "checkEOS", "checkGHS", "check.OBIGT",
+  "checkEOS", "checkGHS", "check.OBIGT",
   "V_s_var", "Cp_s_var",
   "DGinf", "SD", "pearson", "shannon", "CV", "logact",
   "EOSlab", "EOScalc",

Modified: pkg/CHNOSZ/R/examples.R
===================================================================
--- pkg/CHNOSZ/R/examples.R	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/R/examples.R	2022-02-07 04:38:29 UTC (rev 699)
@@ -6,7 +6,7 @@
   # run all the examples in CHNOSZ documentation
   .ptime <- proc.time()
   topics <- c("thermo", "examples",
-    "util.array", "util.blast", "util.data", "util.expression", "util.legend", "util.plot",
+    "util.array", "util.data", "util.expression", "util.legend", "util.plot",
     "util.fasta", "util.formula", "util.misc", "util.seq", "util.units",
     "util.water", "taxonomy", "info", "retrieve", "add.OBIGT", "protein.info",
     "hkf", "water", "IAPWS95", "subcrt", "Berman",

Deleted: pkg/CHNOSZ/R/util.blast.R
===================================================================
--- pkg/CHNOSZ/R/util.blast.R	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/R/util.blast.R	2022-02-07 04:38:29 UTC (rev 699)
@@ -1,163 +0,0 @@
-# CHNOSZ/blast.R
-# functions to analyze BLAST output files
-# 20100320 jmd
-
-## read a BLAST tabular output file, and filter by similarity,
-## E-value and max hits per query
-read.blast <- function(file, similarity=30, evalue=1e-5, max.hits=1, min.length=NA, quiet=FALSE) {
-  # display some information about the file
-  if(inherits(file, "connection")) fname <- summary(file)$description
-  else fname <- basename(file)
-  cat(paste("read.blast: reading", fname, "\n"))
-  # read the blast tabular file
-  blast <- read.csv(file,header=FALSE,sep="\t",stringsAsFactors=FALSE)
-  if(!quiet) cat(paste("  read",nrow(blast),"lines with",length(unique(blast$V1)),"query sequences\n"))
-  # take out rows that don't meet our desired similarity
-  if(!is.na(similarity)) {
-    is <- which(blast$V3 >= similarity)
-    blast <- blast[is,]
-    if(!quiet) cat(paste("  similarity filtering leaves",length(is),"lines and",length(unique(blast$V1)),"query sequences\n"))
-  }
-  # take out rows that don't meet our desired e-value
-  if(!is.na(evalue)) {
-    ie <- which(blast$V11 <= evalue)
-    blast <- blast[ie,]
-    if(!quiet) cat(paste("  evalue filtering leaves",length(ie),"lines and",length(unique(blast$V1)),"query sequences\n"))
-  }
-  # take out rows that don't have the minimum alignment length
-  if(!is.na(min.length)) {
-    ie <- which(blast$V4 >= min.length)
-    blast <- blast[ie,]
-    if(!quiet) cat(paste("  alignment length filtering leaves",length(ie),"lines and",length(unique(blast$V1)),"query sequences\n"))
-  }
-  # now take only max hits for each query sequence
-  if(!is.na(max.hits)) {
-    query.shift <- query <- blast$V1
-    lq <- length(query)
-    # for short (i.e., 1 query sequence) files, make sure that the hits get counted
-    query.shift[max((lq-max.hits+1),1):lq] <- -1
-    #query.shift <- query.shift[c((max.hits+1):lq,1:max.hits)]
-    query.shift <- query.shift[c((lq-max.hits+1):lq,1:(lq-max.hits))]
-    ib <- which(query!=query.shift)
-    blast <- blast[ib,]
-    if(!quiet) cat(paste("  max hits filtering leaves",length(ib),"lines and",length(unique(blast$V1)),"query sequences\n"))
-  }
-  # assign meaningful column names
-  # http://bergmanlab.smith.man.ac.uk/?p=41, accessed 20111223
-  colnames(blast) <- c("queryId", "subjectId", "percIdentity", "alnLength", "mismatchCount", 
-    "gapOpenCount", "queryStart", "queryEnd", "subjectStart", "subjectEnd", "eVal", "bitScore")
-  return(blast)
-}
-
-## process a blast table, identify the taxon
-## for each hit
-id.blast <- function(blast, gi.taxid, taxid.names, min.taxon=0, 
-  min.query=0, min.phylum=0, take.first=TRUE) {
-  # what are gi numbers of the hits
-  # we use def2gi to extract just the gi numbers
-  gi <- def2gi(blast[,2])
-  # what taxid do they hit
-  cat("id.blast: getting taxids ... ")
-  imatch <- match(gi,gi.taxid[[1]])
-  taxid <- gi.taxid[[2]][imatch]
-  # what phyla are these
-  cat("getting taxid.names ... ")
-  iphy <- match(taxid,taxid.names$taxid)
-  phylum <- taxid.names$phylum[iphy]
-  species <- taxid.names$species[iphy]
-  cat(paste(length(unique(phylum)),"unique phyla,",length(unique(taxid)),"unique taxa\n"))
-  # now expand phyla into our blast table
-  # we really don't want our stringsAsFactors (to use NAphylum, below)
-  tax.out <- data.frame(taxid=taxid, phylum=as.character(phylum), species=species, 
-    stringsAsFactors=FALSE)
-  blast.out <- blast[,c(1,2,3,11)]
-  colnames(blast.out) <- c("queryId","subjectId","percIdentity","eVal")
-  blast <- cbind(blast.out,tax.out)
-  # drop taxa that do not appear a certain number of times
-  blast$taxid <- as.character(blast$taxid)
-  nt <- table(blast$taxid)
-  it <- which(nt/sum(nt) >= min.taxon)
-  itt <- which(blast$taxid %in% names(nt)[it])
-  blast <- blast[itt,]
-  cat(paste("  min taxon abundance filtering leaves",length(unique(blast$queryId)),
-    "query sequences,",length(unique(blast$phylum)),"phyla,",length(unique(blast$taxid)),"taxa\n"))
-  # only take phylum assignments that make up at least a certain 
-  # fraction ('amin') of hits to the query sequence
-  uquery <- unique(blast$queryId)
-  iquery <- match(uquery,blast$queryId)
-  # function to select the (highest) represented phylum for each query
-  iqfun <- function(i) {
-    if((i-1)%%1000==0) cat(paste(i,""))
-    myiq <- which(blast$query==uquery[i])
-    # we don't have to do the calculation if there's only one hit
-    if(length(myiq)==1) {
-      iq <- iquery[i]
-    } else {
-      # take those hits, count each phyla, take the highest abundance,
-      # check if it's above the minimum proportion of hits
-      p <- as.character(blast$phylum[myiq])
-      np <- table(p)
-      pp <- np/sum(np)
-      ip <- which.max(pp)
-      if(pp[ip] < min.query) return(numeric())
-      # use the first hit to that phylum
-      myphy <- blast$phylum[myiq]
-      iphy <- which(names(pp[ip])==myphy)
-      iq <- myiq[iphy[1]]
-    }
-    return(iq)
-  }
-  # using lapply is tempting, but haven't got it working
-  #iq <- as.numeric(lapply(1:length(uquery),iqfun))
-  #iq <- iq[!is.na(iq)]
-  if(min.query!=0) {
-    cat("  filtering by phylum representation per query... ")
-    iq <- numeric()
-    for(i in 1:length(uquery)) iq <- c(iq,iqfun(i))
-    cat("done!\n")
-    blast <- blast[iq,]
-    cat(paste("  min query representation filtering leaves",length(iq),"query sequences,",length(unique(blast$phylum)),
-      "phyla,",length(unique(blast$taxid)),"taxa\n"))
-  } else {
-    # we'll just take the first hit for each query sequence
-    if(take.first) blast <- blast[iquery,]
-  }
-  # now on to drop those phyla that are below a certain relative abundance
-  # we count NA as a real phylum
-  iNA <- which(is.na(blast$phylum))
-  blast$phylum[iNA] <- "NAphylum"
-  np <- table(blast$phylum)
-  ip <- which(np/sum(np) >= min.phylum)
-  ipp <- which(blast$phylum %in% names(np)[ip])
-  blast <- blast[ipp,]
-  cat(paste("  min phylum abundance filtering leaves",length(ipp),"query sequences,",length(unique(blast$phylum)),
-    "phyla,",length(unique(blast$taxid)),"taxa\n"))
-  return(blast)
-} 
-
-write.blast <- function(blast,outfile) {
-  # using the output of read.blast
-  # create a trimmed "blast format" output file with data only
-  # in the following columns
-  # 1 - query sequence ID
-  # 2 - hit sequence ID (i.e., FASTA defline .. extract gi number)
-  # 3 - similarity
-  # 11 - evalue
-  not <- rep(NA,length(blast[[1]]))
-  not7 <- data.frame(not,not,not,not,not,not,not)
-  c2 <- paste("gi",def2gi(blast[[2]]),sep="|")
-  out <- data.frame(blast[[1]],c2,blast[[3]],not7,blast[[11]],not)
-  write.table(out,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE,na="")
-  return(invisible(out))
-}
-
-def2gi <- function(def) {
-  # extract gi numbers from FASTA deflines 20110131
-  stuff <- strsplit(def,"\\|")
-  gi <- sapply(1:length(stuff),function(x) {
-    # the gi number should be in the 2nd position (after "gi")
-    if(length(stuff[[x]])==1) return(stuff[[x]][1])
-    else return(stuff[[x]][2])
-  })
-  return(gi)
-}

Modified: pkg/CHNOSZ/inst/NEWS.Rd
===================================================================
--- pkg/CHNOSZ/inst/NEWS.Rd	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/inst/NEWS.Rd	2022-02-07 04:38:29 UTC (rev 699)
@@ -10,7 +10,7 @@
 \newcommand{\s}{\ifelse{latex}{\eqn{_{#1}}}{\ifelse{html}{\out{<sub>#1</sub>}}{#1}}}
 \newcommand{\S}{\ifelse{latex}{\eqn{^{#1}}}{\ifelse{html}{\out{<sup>#1</sup>}}{^#1}}}
 
-\section{Changes in CHNOSZ version 1.4.1-24 (2022-02-07)}{
+\section{Changes in CHNOSZ version 1.4.1-25 (2022-02-07)}{
 
   \subsection{PLANNED API CHANGE}{
     \itemize{
@@ -83,6 +83,10 @@
       \item Tests are now run using the \CRANpkg{tinytest} package.
 
       \item Remove \code{maxdiff()} and \code{expect_maxdiff()}.
+
+      \item Remove \code{read.blast()}, \code{id.blast()},
+      \code{write.blast()}, \code{def2gi()}, and example files in
+      \file{extdata/bison}.
       
       \item In \file{vignettes/multi-metal.Rmd} (\dQuote{Diagrams with multiple
         metals}), add a link to the associated paper

Modified: pkg/CHNOSZ/inst/TODO
===================================================================
--- pkg/CHNOSZ/inst/TODO	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/inst/TODO	2022-02-07 04:38:29 UTC (rev 699)
@@ -8,10 +8,8 @@
 
 - move protein.equil() example (previously in equilibrium.Rnw) to JMDplots.
 
-- move extdata/bison to JMDplots package.
+- add elements from Robie and Hemingway (1995).
 
-- add elements from RH95.
-
 - add check to mosaic() that 'stable' values have the right dimensions.
 
 - move H2O92D.f and R wrapper to a separate package (so people

Modified: pkg/CHNOSZ/man/CHNOSZ-package.Rd
===================================================================
--- pkg/CHNOSZ/man/CHNOSZ-package.Rd	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/man/CHNOSZ-package.Rd	2022-02-07 04:38:29 UTC (rev 699)
@@ -25,7 +25,7 @@
   \item Thermodynamic calculations: \code{\link{util.formula}}, \code{\link{makeup}}, \code{\link{util.units}}, \code{\link{eos}}, \code{\link{Berman}}, \code{\link{nonideal}}, \code{\link{util.misc}}
   \item Water properties: \code{\link{water}}, \code{\link{util.water}}, \code{\link{DEW}}, \code{\link{IAPWS95}}
   \item Protein properties: \code{\link{protein.info}}, \code{\link{add.protein}}, \code{\link{util.fasta}}, \code{\link{util.protein}}, \code{\link{util.seq}}, \code{\link{ionize.aa}}
-  \item Other tools: \code{\link{examples}}, \code{\link{eqdata}}, \code{\link{taxonomy}}, \code{\link{util.blast}}
+  \item Other tools: \code{\link{examples}}, \code{\link{eqdata}}, \code{\link{taxonomy}}
   \item Utility functions: \code{\link{util.expression}}, \code{\link{util.plot}}, \code{\link{util.array}}, \code{\link{util.list}}, \code{\link{palply}}
 }
 These concept entries are visible to \code{\link{help.search}} (aka \code{??}).
@@ -41,13 +41,11 @@
 }
 
 \section{Acknowledgements}{
-  This package would not exist without the encouragement and groudbreaking work of the late Professor Harold C. Helgeson.
-  The revised Helgeson-Kirkham-Flowers equations of state are used in this package, together with thermodynamic properties of minerals and aqueous species from many papers coauthored by Helgeson.
-  CHNOSZ uses Fortran code from \code{H2O92D.f} in the SUPCRT92 package (Johnson et al., 1992), with only minor modifications (masking of WRITE and STOP statements made for compatibility with the \R environment and keep \code{valTP} flag TRUE to permit sub-zero \degC calculations).
+  This package would not exist without the scientific influence and friendship of the late Professor Harold C. Helgeson.
+  The \file{src/H2O92D.f} file with Fortran code for calculating the thermodynamic and electrostatic properties of \H2O is modified from the \acronym{SUPCRT92} package (Johnson et al., 1992).
 
-  Work on this package at U.C. Berkeley from ca. 2003 to 2008 was supported by research grants to HCH from the U.S. National Science Foundation and Department of Energy.
-  In 2009--2011, development of this package was based upon work supported by the National Science Foundation under grant EAR-0847616.
-  The files in \code{extdata/bison} are derived from BLAST calculations made on the Saguaro high performance computer at Arizona State University. 
+  Work on CHNOSZ at U.C. Berkeley from ca. 2003 to 2008 was supported in part by research grants to HCH from the U.S. National Science Foundation and Department of Energy.
+  In 2009--2011, development of this package was partially supported by NSF grant EAR-0847616 to JMD.
 }
 
 \references{

Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/man/extdata.Rd	2022-02-07 04:38:29 UTC (rev 699)
@@ -22,15 +22,6 @@
     \item The \code{testing} directory contains data files based on Berman and Aranovich (1996). These are used to demonstrate the addition of data from a user-supplied file (see \code{\link{Berman}}).
   }
 
-  Files in \code{bison} contain BLAST results and taxonomic information for an environmental metagenome from the Bison Pool hot spring in Yellowstone National Park:
-  \itemize{
-    \item \code{bisonN_vs_refseq57.blast.xz}, \code{bisonS...}, \code{bisonR...}, \code{bisonQ...}, \code{bisonP...} are partial tabular BLAST results for proteins in the Bison Pool Environmental Genome. Protein sequences predicted in the metagenome were downloaded from the Joint Genome Institute's IMG/M system on 2009-05-13. The target database for the searches was constructed from microbial protein sequences in National Center for Biotechnology Information (NCBI) RefSeq database version 57, representing 7415 microbial genomes. The \sQuote{blastall} command was used with the default setting for E value cuttoff (10.0) and options to make a tabular output file consisting of the top 20 hits for each query sequence. The function \code{\link{read.blast}} was used to extract only those hits with E values less than or equal to 1e-5 and with sequence similarity (percent identity) at least 30 percent, and to keep only the first hit for each query sequence. The function \code{\link{write.blast}} was used to save partial BLAST files (only selected columns). The files provided with CHNOSZ contain the first 1000 hits for each sampling site at Bison Pool, representing between about 1.5 to 3 percent of the first BLAST hits after similarity and E value filtering.
-    \item \code{gi.taxid.txt.xz} is a table that lists the sequence identifiers (gi numbers) that appear in the example BLAST files (see above), together with the corresponding taxon ids used in the NCBI databases.
-    \item \code{taxid_names.csv.xz} A table of scientific names for the taxids in \code{gi.taxid.txt.xz}.
-    See \code{\link{id.blast}} for examples that use these files.
-  }
-
-
   Files in \code{cpetc} contain experimental and calculated thermodynamic and environmental data:
   \itemize{
     \item \code{PM90.csv} Heat capacities of four unfolded aqueous proteins taken from Privalov and Makhatadze, 1990. Temperature in \degC is in the first column, and heat capacities of the proteins in J mol\eqn{^{-1}}{^-1} K\eqn{^{-1}}{^-1} in the remaining columns. See \code{\link{ionize.aa}} and the vignette \viglink{anintro} for examples that use this file.

Deleted: pkg/CHNOSZ/man/util.blast.Rd
===================================================================
--- pkg/CHNOSZ/man/util.blast.Rd	2022-02-07 03:46:08 UTC (rev 698)
+++ pkg/CHNOSZ/man/util.blast.Rd	2022-02-07 04:38:29 UTC (rev 699)
@@ -1,117 +0,0 @@
-\encoding{UTF-8}
-\name{util.blast}
-\alias{util.blast}
-\alias{read.blast}
-\alias{id.blast}
-\alias{write.blast}
-\alias{def2gi}
-\title{Functions to Work with BLAST Output Files}
-
-\description{
-  Read and filter BLAST tabular output files, make taxonomic identifications of the BLAST hits using gi numbers, write trimmed-down BLAST files. 
-}
-
-\usage{
-  read.blast(file, similarity = 30, evalue = 1e-5, max.hits = 1, 
-    min.length = NA, quiet = FALSE)
-  id.blast(blast, gi.taxid, taxid.names, min.taxon = 0, 
-    min.query = 0, min.phylum = 0, take.first = TRUE)
-  write.blast(blast, outfile)
-  def2gi(def)
-}
-
-\arguments{
-  \item{file}{character, name of BLAST tabular output file}
-  \item{similarity}{numeric, hits above this similarity score are kept}
-  \item{evalue}{character, hits below this E value are kept}
-  \item{max.hits}{numeric, up to this many hits are kept for each query sequence}
-  \item{min.length}{numeric, hits with at least this alignment length are kept}
-  \item{quiet}{logical, produce fewer messages?}
-  \item{blast}{dataframe, BLAST table}
-  \item{gi.taxid}{list, first component is sequence identifiers (gi numbers), second is taxon ids (taxids)}
-  \item{taxid.names}{dataframe, with at least columns \samp{taxid} (taxon id), \samp{phylum} (name of phylum), \samp{species} (name of species)}
-  \item{min.taxon}{numeric, this taxon is kept if it makes up at least this fraction of total}
-  \item{min.query}{numeric, query sequence is counted if a single phylum makes up this fraction of its hits}
-  \item{min.phylum}{numeric, this phylum is kept if it makes up at least this fraction of total}
-  \item{take.first}{logical, keep only first hit after all other filtering steps?}
-  \item{outfile}{character, name of output file}
-  \item{def}{character, FASTA defline(s)}
-}
-
-\details{
-
-  \code{read.blast} reads a BLAST (Altschul et al., 1997) tabular output \code{file} (such as generated using the -m 8 switch to the \sQuote{blastall} command), keeping only those hits with greater than or equal to \code{similarity} and less than or equal to \code{evalue} (expectation value). Furthermore, for each query sequence, only the top number of hits specified by \code{max.hits} are kept, and only hits with an alignment length of at least \code{min.length} are kept. One or more of these filters can be disabled by setting \code{similarity}, \code{evalue} and/or \code{max.hits} to NA. 
-
-  \code{id.blast} takes a BLAST table (i.e., the output of \code{read.blast}) and finds the taxonomic ID, phylum and species name for each hit (subject sequence). The BLAST results are tied to taxids using \code{gi.taxid}, which is a list consisting of \samp{gi} and \samp{taxid} numeric vectors. Any subject sequence identifiers appearing in the BLAST file that do not match gi numbers in the \code{gi.taxid} list are dropped. The \code{taxid.names} dataframe lists the phylum and species names for each taxid.
-
-  \code{id.blast} furthermore performs three possible filtering steps, which are all disabled by default. If one or more of the arguments is set to a non-zero value, its operation is performed, in this order. Any taxon that does not initially make up at least the fraction of total hits given by \code{min.taxon} is removed. Any query sequence that does not have a single phylum making up at least the fraction of hits (for each query sequence) given by \code{min.query} is removed. Finally, any phylum that does not make up at least the fraction of total hits given by \code{min.phylum} is removed.
-
-  By default, for \code{take.first} equal to TRUE, \code{id.blast} performs a final filtering step (but \code{min.query} must be disabled). Only the first hit for each query sequence is kept.
-
-\code{write.blast} takes a BLAST table (the output of \code{read.blast}) and writes to \code{outfile} a stripped-down BLAST file with empty values in the columns except for columns 1 (query sequence ID), 2 (hit sequence ID), 3 (similarity), 11 (E value).
-In the process, \code{\link{def2gi}} is used to extract the GI numbers for the hit sequences that are then kept in the second column.
-This function is used to reduce the size of the example BLAST files that are packaged with CHNOSZ (see the \sQuote{bison} section in \code{\link{extdata}}).
-
-  \code{def2gi} extracts the GI number from a FASTA defline. 
-
-}
-
-\value{
-  \code{read.blast} returns a dataframe with as many columns (12) as the BLAST file. \code{id.blast} returns a dataframe with columns \code{query}, \code{subject} (i.e., sequence id or gi number), \code{similarity}, \code{evalue}, \code{taxid}, \code{phylum} and \code{species}. \code{write.blast} \code{\link{invisible}}-y returns the results (that are also written to \code{outfile}).
-}
-
-\examples{
-## using def2gi
-def <- "gi|218295810|ref|ZP_03496590.1|"
-def2gi(def)  # "218295810"
-
-## process some of the BLAST output for proteins
-## from Bison Pool metagenome (JGI, 2007)
-# read the file that connects taxids with the sequence identifier
-tfile <- system.file("extdata/bison/gi.taxid.txt.xz", package="CHNOSZ")
-gi.taxid <- scan(tfile, what=as.list(character(2)), flush=TRUE)
-# read the file that connects names with the taxids
-nfile <- system.file("extdata/bison/taxid_names.csv.xz", package="CHNOSZ")
-taxid.names <- read.csv(nfile)
-# the BLAST files
-sites <- c("N","S","R","Q","P")
-bfile <- paste("extdata/bison/bison", sites, "_vs_refseq57.blastp.xz", sep="")
-for(i in 1:5) {
-  file <- system.file(bfile[i], package="CHNOSZ")
-  # read the blast file, with default filtering settings
-  bl <- read.blast(file)
-  # process the blast file -- get taxon names
-  ib <- id.blast(bl, gi.taxid, taxid.names, min.taxon=2^-7)
-  # count each of the phyla
-  bd <- as.matrix(sapply(unique(ib$phylum), function(x) (sum(x==ib$phylum))))
-  colnames(bd) <- sites[i]
-  # make a matrix -- each column for a different file
-  if(i==1) bardata <- bd else {
-    bardata <- merge(bardata, bd, all=TRUE, by="row.names")
-    rownames(bardata) <- bardata$Row.names
-    bardata <- bardata[,-1]
-  }
-}
-# normalize the counts
-bardata[is.na(bardata)] <- 0
-bardata <- t(t(bardata)/colSums(bardata))
-# make a bar chart
-bp <- barplot(as.matrix(bardata), col=rainbow(nrow(bardata)),
-  xlab="location", ylab="fractional abundance")
-# add labels to the bars
-names <- substr(row.names(bardata), 1, 3)
-for(i in 1:5) {
-  bd <- bardata[,i]
-  ib <- bd!=0
-  y <- (cumsum(bd) - bd/2)[ib]
-  text(bp[i], y, names[ib])
-}
-title(main=paste("Phylum Classification of Protein Sequences",
-  "in Part of the Bison Pool Metagenome", sep="\n"))
-}
-
-\references{
-  Altschul, S. F., Madden, T. L., Schaffer, A. A., Zhang, J. H., Zhang, Z., Miller, W. and Lipman, D. J. (1997) Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. \emph{Nucleic Acids Res.} \bold{25}, 3389--3402. \doi{10.1093/nar/25.17.3389} 
-}
-
-\concept{Other tools}



More information about the CHNOSZ-commits mailing list