[CHNOSZ-commits] r121 - in pkg/CHNOSZ: . R inst man

Sat Jan 21 13:53:11 CET 2017

Author: jedick
Date: 2017-01-21 13:53:11 +0100 (Sat, 21 Jan 2017)
New Revision: 121

Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/util.fasta.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/man/util.fasta.Rd
Log:
read.fasta() extracts information from UniProt FASTA headers


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================

--- pkg/CHNOSZ/DESCRIPTION	2017-01-07 07:20:14 UTC (rev 120)
+++ pkg/CHNOSZ/DESCRIPTION	2017-01-21 12:53:11 UTC (rev 121)
@@ -1,6 +1,6 @@
-Date: 2017-01-07
+Date: 2017-01-21
 Package: CHNOSZ
-Version: 1.0.8-9
+Version: 1.0.8-10
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey Dick
 Maintainer: Jeffrey Dick <j3ffdick at gmail.com>

Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R	2017-01-07 07:20:14 UTC (rev 120)
+++ pkg/CHNOSZ/R/util.fasta.R	2017-01-21 12:53:11 UTC (rev 121)
@@ -108,6 +108,7 @@
   organism <- bnf
   # protein/gene name is from header line for entry
   # (strip the ">" and go to the first space)
+  missid <- missing(id)
   if(is.null(id)) id <- as.character(palply("", 1:length(i), function(j) {
     # get the text of the line
     f1 <- linefun(i[j],i[j])
@@ -129,8 +130,19 @@
       # 20090507 made stringsAsFactors FALSE
       out <- cbind(data.frame(protein=id, organism=organism,
         ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
+      # 20170117 extra processing for files from UniProt
+      isUniProt <- grepl("\\|......\\|.*_", out$protein[1])
+      if(isUniProt & missid) {
+        p1 <- sapply(strsplit(out$protein, "\\|"), "[", 1)
+        p2 <- sapply(strsplit(out$protein, "\\|"), "[", 2)
+        p3 <- sapply(strsplit(out$protein, "\\|"), "[", 3)
+        out$abbrv <- sapply(strsplit(p3, "_"), "[", 1)
+        out$organism <- sapply(strsplit(p3, "_"), "[", 2)
+        out$protein <- paste0(p1, "|", p2)
+      }
+      out
     } else if(type %in% c("DNA", "RNA")) {
-      out <- cbind(data.frame(gene=id, organism=organism,
+      cbind(data.frame(gene=id, organism=organism,
         ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
     }
   } else return(sequences)

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2017-01-07 07:20:14 UTC (rev 120)
+++ pkg/CHNOSZ/inst/NEWS	2017-01-21 12:53:11 UTC (rev 121)
@@ -1,5 +1,5 @@
-CHANGES IN CHNOSZ 1.0.8-8 (2017-01-07)
---------------------------------------
+CHANGES IN CHNOSZ 1.0.8-10 (2017-01-21)
+---------------------------------------
 
 - Add "AA" as a keyword for preset species in basis() (cysteine,
   glutamic acid, glutamine, H2O, oxygen).
@@ -27,6 +27,10 @@
 - Correct charge (-2) of NAD(red)-2 in OBIGT.csv. Thanks to Peter
   Canovas.
 
+- read.fasta() extracts information from UniProt FASTA headers
+  (accession, name, organism) into columns of the output data frame
+  (protein, abbrv, organism).
+
 CHANGES IN CHNOSZ 1.0.8 (2016-05-28)
 ------------------------------------
 

Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd	2017-01-07 07:20:14 UTC (rev 120)
+++ pkg/CHNOSZ/man/util.fasta.Rd	2017-01-21 12:53:11 UTC (rev 121)
@@ -55,6 +55,7 @@
 If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. 
 Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
 When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
+If \code{id} is not given, and a UniProt FASTA header is detected (regular expression \code{"\\|......\\|.*_"}), information there (accession, name, organism) is split into the \code{protein}, \code{abbrv}, and \code{organism} columns of the resulting data frame.
 
 \code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
 For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.