[CHNOSZ-commits] r56 - in pkg/CHNOSZ: . R inst inst/extdata/protein man

Sun Sep 22 12:22:26 CEST 2013

Author: jedick
Date: 2013-09-22 12:22:25 +0200 (Sun, 22 Sep 2013)
New Revision: 56

Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/more.aa.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/inst/extdata/protein/Sce.csv.xz
   pkg/CHNOSZ/man/extdata.Rd
Log:
update extdata/protein/Sce.csv.xz


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2013-07-04 16:15:01 UTC (rev 55)
+++ pkg/CHNOSZ/DESCRIPTION	2013-09-22 10:22:25 UTC (rev 56)
@@ -1,9 +1,9 @@
-Date: 2013-07-04
+Date: 2013-09-22
 Package: CHNOSZ
-Version: 1.0.1
+Version: 1.0.1-1
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey M. Dick
-Maintainer: Jeffrey M. Dick <j3ffdick at gmail.com>
+Maintainer: Jeffrey Dick <j3ffdick at gmail.com>
 Depends: R (>= 2.12.0), utils
 Suggests: limSolve, parallel, testthat
 Description: This package includes functions and data sets to support chemical thermodynamic 

Modified: pkg/CHNOSZ/R/more.aa.R
===================================================================
--- pkg/CHNOSZ/R/more.aa.R	2013-07-04 16:15:01 UTC (rev 55)
+++ pkg/CHNOSZ/R/more.aa.R	2013-09-22 10:22:25 UTC (rev 56)
@@ -9,16 +9,14 @@
   datapath <- paste("extdata/protein/", organism, ".csv.xz", sep="")
   datafile <- system.file(datapath, package="CHNOSZ")
   if(datafile=="") stop(paste("missing", datapath))
-  mydata <- read.csv(datafile)
+  mydata <- read.csv(datafile, as.is=TRUE)
   # if protein is not supplied, just give some information about the datafile
   if(is.null(protein)) {
     msgout("more.aa: ", datapath, " has data for ", nrow(mydata), " proteins\n")
     return(invisible())
   }
   # which columns to search for matches
-  # include "OLN" (Sce.csv dated 2008-08-04)
-  # and "ORF" (Sce.csv dated 2013-06-04)
-  if(organism=="Sce") searchcols <- c("OLN", "ORF", "SGDID")
+  if(organism=="Sce") searchcols <- c("ORF", "SGDID", "GENE")
   else if(organism=="Eco") searchcols <- c("protein", "abbrv")
   # which columns have the amino acids, in the order of thermo$protein 
   iaa <- match(toupper(aminoacids(3)), toupper(colnames(mydata)))
@@ -47,12 +45,14 @@
     }
     aa <- data.frame(mydata[imatch, iaa])
     # add the identifying columns
-    organism <- rep(organism[[1]], length(protein[[i]]))
-    abbrv <- rep(NA, length(protein[[i]]))
-    ref <- rep(NA, length(protein[[i]]))
+    if(organism=="Sce") ref <- mydata$SGDID[imatch]
+    else ref <- rep(NA, length(protein[[i]]))
+    if(organism=="Sce") abbrv <- mydata$GENE[imatch]
+    else abbrv <- rep(NA, length(protein[[i]]))
     chains <- rep(1, length(protein[[i]]))
     chains[inotmatch] <- NA
-    precols <- data.frame(protein[[i]], organism, ref, abbrv, chains, stringsAsFactors=FALSE)
+    org <- rep(organism[[1]], length(protein[[i]]))
+    precols <- data.frame(protein[[i]], organism=org, ref, abbrv, chains, stringsAsFactors=FALSE)
     colnames(precols)[1] <- "protein"
     colnames(aa) <- aminoacids(3)
     aa <- cbind(precols, aa)

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2013-07-04 16:15:01 UTC (rev 55)
+++ pkg/CHNOSZ/inst/NEWS	2013-09-22 10:22:25 UTC (rev 56)
@@ -1,3 +1,11 @@
+CHANGES IN CHNOSZ 1.0.1-1 (2013-09-22)
+--------------------------------------
+
+- Updated extdata/protein/Sce.csv.xz using Saccharomyces Genome Database
+  protein_properties.tab and SGD_features.tab dated 2013-08-24.
+
+- more.aa() includes SGDID and gene name in the ref and abbrv columns.
+
 CHANGES IN CHNOSZ 1.0.1 (2013-07-04)
 ------------------------------------
 

Modified: pkg/CHNOSZ/inst/extdata/protein/Sce.csv.xz
===================================================================
(Binary files differ)

Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd	2013-07-04 16:15:01 UTC (rev 55)
+++ pkg/CHNOSZ/man/extdata.Rd	2013-09-22 10:22:25 UTC (rev 56)
@@ -43,11 +43,17 @@
     \item \code{EF-Tu.aln} consists of aligned sequences (394 amino acids) of elongation factor Tu (EF-Tu). The sequences correspond to those taken from UniProtKB for ECOLI (\emph{Escherichia coli}), THETH (\emph{Thermus thermophilus}) and THEMA (\emph{Thermotoga maritima}), and reconstructed ancestral sequences taken from Gaucher et al., 2003 (maximum likelihood bacterial stem and mesophilic bacterial stem, and alternative bacterial stem). See the \sQuote{formation} vignette for an example that uses this file.
   }
 
-  Files in \code{protein} contain protein composition data for model organisms:
+  Files in \code{protein} contain protein composition data for model organisms.
+  See \code{\link{more.aa}} and \code{\link{read.expr}} for examples that use these files.
   \itemize{
-    \item \code{Sce.csv.xz} Data frame of amino acid composition of proteins from the \emph{Saccharomyces} Genome Database. Contains twenty-two columns. Values in the first column are the rownumbers, the second column (\code{OLN}) has the ordered locus names of proteins, and the remaining twenty columns (\code{Ala}..\code{Val}) contain the numbers of the respective amino acids in each protein; the columns are arranged in alphabetical order based on the three-letter abbreviations for the amino acids. The source of data for \samp{Sce.csv} is the file \samp{protein_properties.tab} found on the FTP site of the SGD project on 2008-08-04. Blank entries were replaced with "NA" and column headings were added. See \code{\link{read.expr}} for examples that use this file.
-    \item \code{Eco.csv.xz} Contains 24 columns. Values in the first column correspond to rownumbers, the second column {\code{AC}} holds the accession numbers of the proteins, the third column (\code{Name}) has the names of the corresponding genes, and the fourth column {\code{OLN}} lists the ordered locus names of the proteins. The remaining twenty columns (\code{A}..\code{Y}) give the numbers of the respective amino acids in each protein and are ordered alphabetically by the one-letter abbreviations of the amino acids. The sources of data for \samp{Eco.csv} are the files \samp{ECOLI.dat} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/entries/bacteria} and \samp{ECOLI.fas} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) on 2007-12-20. The proteins can be included in calculations using \code{\link{more.aa}} as well as \code{\link{read.expr}}.
-%%    \item \code{HUM.csv.xz} Downloaded the file \code{uniprot_sprot_human.dat.gz}, dated 2010-08-10, from \url{ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/}, converted from UniProt to FASTA format using the \code{seqret} tool from EMBOSS (\url{http://emboss.sourceforge.net/}). Counted amino acid frequencies using \code{\link{read.fasta}}. Columns are as described in \code{thermo$protein}, except column \code{protein} and \code{abbrv} contain Swiss-Prot name and accession number, respectively (both taken from the header lines in the FASTA file).
+    \item \code{Sce.csv.xz}
+      Data frame of amino acid composition of 6716 proteins from the \emph{Saccharomyces} Genome Database (SGD).
+      Values in the first three columns are the \code{ORF} names of proteins, \code{SGDID}, and \code{GENE} names. The remaining twenty columns (\code{ALA}..\code{VAL}) contain the numbers of the respective amino acids in each protein.
+      The sources of data for \samp{Sce.csv} are the files \samp{protein_properties.tab} and \samp{SGD_features.tab} (for the gene names), downloaded from \url{http://www.yeastgenome.org} on 2013-08-24.
+    \item \code{Eco.csv.xz}
+      Amino acid compositions of 4407 proteins in \emph{Escherichia coli} strain K12.
+      Format is the one used in \code{\link{thermo}$protein}, with columns \samp{protein} holding the gene name, \samp{organism} set to \samp{ECOLI}, and \code{abbrv} holding the UniProt ID.
+      The source of data is the file \samp{ECOLI.fas} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) (\url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria}) on 2010-09-25.
   }