From noreply at r-forge.r-project.org Sun Sep 22 12:22:26 2013 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 22 Sep 2013 12:22:26 +0200 (CEST) Subject: [CHNOSZ-commits] r56 - in pkg/CHNOSZ: . R inst inst/extdata/protein man Message-ID: <20130922102226.27EFD183C48@r-forge.r-project.org> Author: jedick Date: 2013-09-22 12:22:25 +0200 (Sun, 22 Sep 2013) New Revision: 56 Modified: pkg/CHNOSZ/DESCRIPTION pkg/CHNOSZ/R/more.aa.R pkg/CHNOSZ/inst/NEWS pkg/CHNOSZ/inst/extdata/protein/Sce.csv.xz pkg/CHNOSZ/man/extdata.Rd Log: update extdata/protein/Sce.csv.xz Modified: pkg/CHNOSZ/DESCRIPTION =================================================================== --- pkg/CHNOSZ/DESCRIPTION 2013-07-04 16:15:01 UTC (rev 55) +++ pkg/CHNOSZ/DESCRIPTION 2013-09-22 10:22:25 UTC (rev 56) @@ -1,9 +1,9 @@ -Date: 2013-07-04 +Date: 2013-09-22 Package: CHNOSZ -Version: 1.0.1 +Version: 1.0.1-1 Title: Chemical Thermodynamics and Activity Diagrams Author: Jeffrey M. Dick -Maintainer: Jeffrey M. 
Dick +Maintainer: Jeffrey Dick Depends: R (>= 2.12.0), utils Suggests: limSolve, parallel, testthat Description: This package includes functions and data sets to support chemical thermodynamic Modified: pkg/CHNOSZ/R/more.aa.R =================================================================== --- pkg/CHNOSZ/R/more.aa.R 2013-07-04 16:15:01 UTC (rev 55) +++ pkg/CHNOSZ/R/more.aa.R 2013-09-22 10:22:25 UTC (rev 56) @@ -9,16 +9,14 @@ datapath <- paste("extdata/protein/", organism, ".csv.xz", sep="") datafile <- system.file(datapath, package="CHNOSZ") if(datafile=="") stop(paste("missing", datapath)) - mydata <- read.csv(datafile) + mydata <- read.csv(datafile, as.is=TRUE) # if protein is not supplied, just give some information about the datafile if(is.null(protein)) { msgout("more.aa: ", datapath, " has data for ", nrow(mydata), " proteins\n") return(invisible()) } # which columns to search for matches - # include "OLN" (Sce.csv dated 2008-08-04) - # and "ORF" (Sce.csv dated 2013-06-04) - if(organism=="Sce") searchcols <- c("OLN", "ORF", "SGDID") + if(organism=="Sce") searchcols <- c("ORF", "SGDID", "GENE") else if(organism=="Eco") searchcols <- c("protein", "abbrv") # which columns have the amino acids, in the order of thermo$protein iaa <- match(toupper(aminoacids(3)), toupper(colnames(mydata))) @@ -47,12 +45,14 @@ } aa <- data.frame(mydata[imatch, iaa]) # add the identifying columns - organism <- rep(organism[[1]], length(protein[[i]])) - abbrv <- rep(NA, length(protein[[i]])) - ref <- rep(NA, length(protein[[i]])) + if(organism=="Sce") ref <- mydata$SGDID[imatch] + else ref <- rep(NA, length(protein[[i]])) + if(organism=="Sce") abbrv <- mydata$GENE[imatch] + else abbrv <- rep(NA, length(protein[[i]])) chains <- rep(1, length(protein[[i]])) chains[inotmatch] <- NA - precols <- data.frame(protein[[i]], organism, ref, abbrv, chains, stringsAsFactors=FALSE) + org <- rep(organism[[1]], length(protein[[i]])) + precols <- data.frame(protein[[i]], organism=org, ref, abbrv, 
chains, stringsAsFactors=FALSE) colnames(precols)[1] <- "protein" colnames(aa) <- aminoacids(3) aa <- cbind(precols, aa) Modified: pkg/CHNOSZ/inst/NEWS =================================================================== --- pkg/CHNOSZ/inst/NEWS 2013-07-04 16:15:01 UTC (rev 55) +++ pkg/CHNOSZ/inst/NEWS 2013-09-22 10:22:25 UTC (rev 56) @@ -1,3 +1,11 @@ +CHANGES IN CHNOSZ 1.0.1-1 (2013-09-22) +-------------------------------------- + +- Updated extdata/protein/Sce.csv.xz using Saccharomyces Genome Database + protein_properties.tab and SGD_features.tab dated 2013-08-24. + +- more.aa() includes SGDID and gene name in the ref and abbrv columns, respectively. + CHANGES IN CHNOSZ 1.0.1 (2013-07-04) ------------------------------------ Modified: pkg/CHNOSZ/inst/extdata/protein/Sce.csv.xz =================================================================== (Binary files differ) Modified: pkg/CHNOSZ/man/extdata.Rd =================================================================== --- pkg/CHNOSZ/man/extdata.Rd 2013-07-04 16:15:01 UTC (rev 55) +++ pkg/CHNOSZ/man/extdata.Rd 2013-09-22 10:22:25 UTC (rev 56) @@ -43,11 +43,17 @@ \item \code{EF-Tu.aln} consists of aligned sequences (394 amino acids) of elongation factor Tu (EF-Tu). The sequences correspond to those taken from UniProtKB for ECOLI (\emph{Escherichia coli}), THETH (\emph{Thermus thermophilus}) and THEMA (\emph{Thermotoga maritima}), and reconstructed ancestral sequences taken from Gaucher et al., 2003 (maximum likelihood bacterial stem and mesophilic bacterial stem, and alternative bacterial stem). See the \sQuote{formation} vignette for an example that uses this file. } - Files in \code{protein} contain protein composition data for model organisms: + Files in \code{protein} contain protein composition data for model organisms. + See \code{\link{more.aa}} and \code{\link{read.expr}} for examples that use these files. 
\itemize{ - \item \code{Sce.csv.xz} Data frame of amino acid composition of proteins from the \emph{Saccharomyces} Genome Database. Contains twenty-two columns. Values in the first column are the rownumbers, the second column (\code{OLN}) has the ordered locus names of proteins, and the remaining twenty columns (\code{Ala}..\code{Val}) contain the numbers of the respective amino acids in each protein; the columns are arranged in alphabetical order based on the three-letter abbreviations for the amino acids. The source of data for \samp{Sce.csv} is the file \samp{protein_properties.tab} found on the FTP site of the SGD project on 2008-08-04. Blank entries were replaced with "NA" and column headings were added. See \code{\link{read.expr}} for examples that use this file. - \item \code{Eco.csv.xz} Contains 24 columns. Values in the first column correspond to rownumbers, the second column {\code{AC}} holds the accession numbers of the proteins, the third column (\code{Name}) has the names of the corresponding genes, and the fourth column {\code{OLN}} lists the ordered locus names of the proteins. The remaining twenty columns (\code{A}..\code{Y}) give the numbers of the respective amino acids in each protein and are ordered alphabetically by the one-letter abbreviations of the amino acids. The sources of data for \samp{Eco.csv} are the files \samp{ECOLI.dat} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/entries/bacteria} and \samp{ECOLI.fas} \url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) on 2007-12-20. The proteins can be included in calculations using \code{\link{more.aa}} as well as \code{\link{read.expr}}. 
-%% \item \code{HUM.csv.xz} Downloaded the file \code{uniprot_sprot_human.dat.gz}, dated 2010-08-10, from \url{ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/}, converted from UniProt to FASTA format using the \code{seqret} tool from EMBOSS (\url{http://emboss.sourceforge.net/}). Counted amino acid frequencies using \code{\link{read.fasta}}. Columns are as described in \code{thermo$protein}, except column \code{protein} and \code{abbrv} contain Swiss-Prot name and accession number, respectively (both taken from the header lines in the FASTA file). + \item \code{Sce.csv.xz} + Data frame of amino acid composition of 6716 proteins from the \emph{Saccharomyces} Genome Database (SGD). + Values in the first three columns are the \code{ORF} names of proteins, \code{SGDID}, and \code{GENE} names. The remaining twenty columns (\code{ALA}..\code{VAL}) contain the numbers of the respective amino acids in each protein. + The sources of data for \samp{Sce.csv} are the files \samp{protein_properties.tab} and \samp{SGD_features.tab} (for the gene names), downloaded from \url{http://www.yeastgenome.org} on 2013-08-24. + \item \code{Eco.csv.xz} + Amino acid compositions of 4407 proteins in \emph{Escherichia coli} strain K12. + Format is the one used in \code{\link{thermo}$protein}, with columns \samp{protein} holding the gene name, \samp{organism} set to \samp{ECOLI}, and \code{abbrv} holding the UniProt ID. + The source of data is the file \samp{ECOLI.fas} downloaded from the HAMAP (High-quality Automated and Manual Annotation of microbial Proteomes system) FTP site (Gattiker et al., 2003) (\url{ftp://ftp.expasy.org/databases/hamap/complete_proteomes/fasta/bacteria}) on 2010-09-25. }