[CHNOSZ-commits] r53 - in pkg/CHNOSZ: . R inst inst/tests man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Jun 2 07:15:42 CEST 2013


Author: jedick
Date: 2013-06-02 07:15:42 +0200 (Sun, 02 Jun 2013)
New Revision: 53

Modified:
   pkg/CHNOSZ/DESCRIPTION
   pkg/CHNOSZ/R/protein.info.R
   pkg/CHNOSZ/R/util.fasta.R
   pkg/CHNOSZ/R/util.seq.R
   pkg/CHNOSZ/inst/NEWS
   pkg/CHNOSZ/inst/tests/test-util.seq.R
   pkg/CHNOSZ/man/util.fasta.Rd
   pkg/CHNOSZ/man/util.seq.Rd
Log:
count.aa() and read.fasta() handle DNA sequences


Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/DESCRIPTION	2013-06-02 05:15:42 UTC (rev 53)
@@ -1,6 +1,6 @@
-Date: 2013-05-06
+Date: 2013-06-02
 Package: CHNOSZ
-Version: 1.0.0-1
+Version: 1.0.0-2
 Title: Chemical Thermodynamics and Activity Diagrams
 Author: Jeffrey M. Dick
 Maintainer: Jeffrey M. Dick <j3ffdick at gmail.com>

Modified: pkg/CHNOSZ/R/protein.info.R
===================================================================
--- pkg/CHNOSZ/R/protein.info.R	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/protein.info.R	2013-06-02 05:15:42 UTC (rev 53)
@@ -79,8 +79,8 @@
 protein.length <- function(protein, organism=NULL) {
   # calculate the length(s) of proteins
   aa <- ip2aa(protein, organism)
-  # loop over the proteins
-  pl <- sapply(1:nrow(aa), function(i) sum(aa[i, 6:25]))
+  # use rowSums on the columns containing amino acid counts
+  pl <- as.numeric(rowSums(aa[, 6:25]))
   return(pl)
 }
 

Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/util.fasta.R	2013-06-02 05:15:42 UTC (rev 53)
@@ -45,22 +45,19 @@
 }
 
 read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL,
-  pnff=FALSE, start=NULL, stop=NULL) {
+  start=NULL, stop=NULL, type="protein") {
   # read sequences from a fasta file
-  if(file != "") msgout("read.fasta: reading ",basename(file),"\n")
-  # all of them or only those indicated by i
-  # if aa=TRUE compile a data frame of the amino acid
-  # compositions suitable for add.protein
   # some of the following code was adapted from 
   # read.fasta in package seqinR
-  # TODO: better test for type of system
+  # value of 'i' is what sequences to read 
   # value of 'ret' determines format of return value:
-  # aa: amino acid composition (same columns as thermo$protein)
+  # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
+  #        or nucleic acid base composition (A-C-G-T)
   # seq: amino acid sequence
   # fas: fasta entry
-  # pnff: take the protein name from filename (TRUE) or entry name (FALSE)
   is.nix <- Sys.info()[[1]]=="Linux"
   if(is.nix & is.null(lines)) {
+    msgout("read.fasta: reading ",basename(file),"\n")
     # figure out whether to use 'cat', 'zcat' or 'xzcat'
     suffix <- substr(file,nchar(file)-2,nchar(file))
     if(suffix==".gz") mycat <- "zcat"
@@ -72,7 +69,10 @@
     lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
     linefun <- function(i1,i2) lines[i1:i2]
   } else {
-    if(is.null(lines)) lines <- readLines(file)
+    if(is.null(lines)) {
+      lines <- readLines(file)
+      msgout("read.fasta: reading ",basename(file),"\n")
+    }
     nlines <- length(lines)
     if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
     linefun <- function(i1,i2) lines[i1:i2]
@@ -97,52 +97,42 @@
     for(i in 1:length(begin)) iline <- c(iline,(begin[i]-1):end[i])
     return(lines[iline])
   }
-  # get each sequences from the begin to end lines
+  # get each sequence from the begin to end lines
   seqfun <- function(i) paste(linefun(begin[i],end[i]),collapse="")
   sequences <- lapply(1:length(i), seqfun)
-  # process the header line for each entry
-  # (strip the ">" and go to the first space or underscore)
-  nomfun <- function(befund) {
-    nomnomfun <- function(j,pnff) {
-      # get the text of the line
-      f1 <- linefun(i[j],i[j])
-      # stop if the first character is not ">"
-      # or the first two charaters are "> "
-      if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0))
-        stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'."))
-      # discard the leading '>'
-      f2 <- substr(f1, 2, nchar(f1))
-      # keep everything before the first space
-      f3 <- strsplit(f2," ")[[1]][1]
-      # then before or after the first underscore
-      if(befund) f4 <- strsplit(f3,"_")[[1]][1]
-      else f4 <- strsplit(f3,"_")[[1]][2]
-      return(f4)
-    }
-    noms <- as.character(palply(1:length(i),nomnomfun))
-    return(noms)
-  }
-  # process the file name
+  # organism name is from file name
   # (basename minus extension)
   bnf <- strsplit(basename(file),split=".",fixed=TRUE)[[1]][1]
-  if(pnff) {
-    # protein name is from file name
-    # organism name is from entry
-    protein <- bnf
-    organism <- nomfun(befund=FALSE)
-  } else {
-    # vice versa
-    protein <- nomfun(befund=TRUE)
-    organism <- bnf
-  }
+  organism <- bnf
+  # protein/gene name is from header line for entry
+  # (strip the ">" and go to the first space or underscore)
+  id <- as.character(palply(1:length(i), function(j) {
+    # get the text of the line
+    f1 <- linefun(i[j],i[j])
+    # stop if the first character is not ">"
+    # or the first two charaters are "> "
+    if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0))
+      stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'."))
+    # discard the leading '>'
+    f2 <- substr(f1, 2, nchar(f1))
+    # keep everything before the first space
+    f3 <- strsplit(f2," ")[[1]][1]
+    # then before or after the first underscore
+    return(strsplit(f3,"_")[[1]][1])
+  } ))
   if(ret=="count") {
-    aa <- count.aa(sequences, start, stop)
-    colnames(aa) <- aminoacids(3)
+    counts <- count.aa(sequences, start, stop, type)
     ref <- abbrv <- NA
     chains <- 1
-    # 20090507 made stringsAsFactors FALSE
-    return(cbind(data.frame(protein=protein,organism=organism,
-      ref=ref,abbrv=abbrv,chains=chains,stringsAsFactors=FALSE),aa))
+    if(type=="protein") {
+      colnames(counts) <- aminoacids(3)
+      # 20090507 made stringsAsFactors FALSE
+      out <- cbind(data.frame(protein=id, organism=organism,
+        ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
+    } else if(type %in% c("DNA", "RNA")) {
+      out <- cbind(data.frame(gene=id, organism=organism,
+        ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
+    }
   } else return(sequences)
 }
 
@@ -187,13 +177,14 @@
   return(aa)
 }
 
-count.aa <- function(seq, start=NULL, stop=NULL) {
-  # count amino acids in one or more sequences
-  # sequences are given as elements of the list seq
-  aa <- aminoacids(1)
+count.aa <- function(seq, start=NULL, stop=NULL, type="protein") {
+  # count amino acids or DNA bases in one or more sequences given as elements of the list seq
+  # put them in alphabetical order (amino acids: same order as in thermo$protein)
+  if(type=="protein") letts <- aminoacids(1)
+  else if(type=="DNA") letts <- c("A", "C", "G", "T")
+  else stop(paste("unknown sequence type", type))
   # to count the letters in each sequence
   countfun <- function(seq, start, stop) {
-    count <- numeric(20)
     # get a substring if one or both of start or stop are given
     # if only one of start or stop is given, get a default value for the other
     if(!is.null(start)) {
@@ -203,21 +194,22 @@
       seq <- substr(seq, 1, stop)
     }
     # the actual counting
-    naa <- table(strsplit(toupper(seq), "")[[1]])
-    # put them in the same order as in aa (i.e. thermo$protein)
-    iaa <- match(names(naa), aa)
-    # in case any letters don't match some amino acid
-    ina <- is.na(iaa)
-    count[iaa[!ina]] <- naa[!ina]
-    if(any(ina)) msgout("count.aa: unrecognized amino acid code(s): ", 
-      paste(names(naa)[ina], collapse=" "), "\n")
+    nnn <- table(strsplit(toupper(seq), "")[[1]])
+    # get them in alphabetical order
+    ilett <- match(names(nnn), letts)
+    # in case any letters aren't in our alphabet
+    ina <- is.na(ilett)
+    if(any(ina)) msgout(paste("count.aa: unrecognized letter(s) in", type, "sequence:",
+      paste(names(nnn)[ina], collapse=" "), "\n"))
+    count <- numeric(length(letts))
+    count[ilett[!ina]] <- nnn[!ina]
     return(count)
   }
-  # count amino acids in each sequence
+  # counts for each sequence
   a <- palply(seq, countfun, start, stop)
   a <- t(as.data.frame(a, optional=TRUE))
   # clean up row/column names
-  colnames(a) <- aa
+  colnames(a) <- letts
   rownames(a) <- 1:nrow(a)
   return(a)
 }

Modified: pkg/CHNOSZ/R/util.seq.R
===================================================================
--- pkg/CHNOSZ/R/util.seq.R	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/util.seq.R	2013-06-02 05:15:42 UTC (rev 53)
@@ -31,63 +31,45 @@
   else if(nchar=="Z") return(aacharged[iaa])
 }
 
-nucleicacids <- function(seq=NULL,type="DNA",comp=NULL,comp2=NULL) {
-  # count bases or compute the formula, e.g.
-  # n <- nucleicacids(list("AGCT","TTTT"))  # a dataframe of counts
-  # f <- nucleicacids(n)  # a series of formulas
+nucleic.formula <- function(nucleic=NULL) {
+  # compute the formula, e.g.
+  # DNA <- count.aa(list("AGCT", "TTTT"), type="DNA")  # a dataframe of counts
+  # nf <- nucleic.formula(DNA)  # a series of formulas
+  # !!! this only adds the formulas of the nucleobases; dehydration and phosphorylation are not yet accounted for !!!
   # 20090926 jmd
-  if(is.null(seq)) stop("please provide a sequence")
-  if(type=="DNA") {
-    na <- c("A","C","G","T")
-    na.NA <- c("adenine","cytosine","guanine","thymine")
-  } else if(type=="RNA") {
-    na <- c("U","G","C","A")
-    na.NA <- c("uracil","guanine","cytosine","adenine")
-  } else stop(paste("invalid type:",type))
-  if(is.data.frame(seq)) {
-    # compute the chemical formula of bases
-    if(!all(na %in% colnames(seq))) {
-      nabases <- c2s(na[which(!na %in% colnames(seq))],sep=" ")
-      stop(paste("requested type is",type,"but",nabases,"is/are not in the colnames of supplied dataframe"))
-    }
-    # the formulas of each of the bases
-    f.base <- get("thermo")$obigt$formula[info(na.NA[match(colnames(seq),na)])]
-    # loop over the base counts
-    f.out <- character()
-    for(i in 1:nrow(seq)) {
-      # use makeup() with multipliers and sum=TRUE  20120119 jmd
-      f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(seq[i,]), sum=TRUE))
-      f.out <- c(f.out,f)
-    }
-    return(f.out)
-  } else {
-    # count the numbers of nucleic acid bases in a sequence
-    # sequences are given as elements of the list seq
-    # to count the number of each amino acids in a sequence
-    count.na <- function(na,seq) sum(seq==na)
-    count.i <- function(i,seq) as.numeric(lapply(na,count.na,strsplit(toupper(seq[i]),"")[[1]]))
-    # count bases in each sequence
-    n <- t(as.data.frame(palply(1:length(seq),count.i,seq),optional=TRUE))
-    n <- as.data.frame(n)
-    # clean up row/column names
-    colnames(n) <- na
-    rownames(n) <- 1:nrow(n)
-    # return the complement if requested e.g.
-    # nucleicacids(x,type,"DNA")  # DNA complement
-    # nucleicacids(x,type,"RNA")  # RNA complement
-    # nucleicacids(x,type,"DNA","RNA")  # DNA, then RNA complement
-    if(!is.null(comp)) {
-      if(comp=="DNA") colnames(n) <- c("T","G","C","A")
-      else if(comp=="RNA") colnames(n) <- c("U","G","C","A")
-      else stop(paste("invalid complement request:",comp))
-    }
-    if(!is.null(comp2)) {
-      if(comp2=="DNA") colnames(n) <- c("A","C","G","T")
-      else if(comp2=="RNA") colnames(n) <- c("A","C","G","U")
-      else stop(paste("invalid complement request:",comp))
-    }
-    return(n)
+  letts <- c("A", "C", "G", "T", "U")
+  names <- c("adenine", "cytosine", "guanine", "thymine", "uracil")
+  # the locations of the letters in the data frame
+  i.lett <- match(letts, colnames(nucleic))
+  # we'll normally have at least one NA (U or A for DNA or RNA)
+  ina <- is.na(i.lett)
+  # the species indices of the bases, in the order appearing above
+  i.base <- suppressMessages(info(names[!ina], check.it=FALSE))
+  # the chemical formula of bases
+  f.base <- get("thermo")$obigt$formula[i.base]
+  # loop over the base counts
+  f.out <- character()
+  for(i in 1:nrow(nucleic)) {
+    # use makeup() with multipliers and sum=TRUE  20120119 jmd
+    f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(nucleic[i, i.lett[!ina]]), sum=TRUE))
+    f.out <- c(f.out, f)
   }
+  return(f.out)
 }
 
-
+nucleic.complement <- function(nucleic=NULL, type="DNA") {
+  # return the nucleobase complement
+  # nucleic.complement(nucleic, "DNA")  # DNA complement
+  # nucleic.complement(nucleic, "RNA")  # RNA complement
+  # the reference sequence, and its DNA and RNA complements
+  ref <- c("A", "C", "G", "T", "U")
+  DNA <- c("T", "G", "C", "A", "A")
+  RNA <- c("U", "G", "C", "A", "A")
+  iref <- match(colnames(nucleic), ref)
+  i.base <- which(!is.na(iref))
+  colnames(nucleic)[i.base] <- get(type)[iref[i.base]]
+  # be nice and re-alphabetize the columns
+  o.base <- order(colnames(nucleic)[i.base])
+  nucleic <- nucleic[, i.base[o.base], drop=FALSE]
+  return(nucleic)
+}

Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/inst/NEWS	2013-06-02 05:15:42 UTC (rev 53)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.0-1 (2013-05-06)
+CHANGES IN CHNOSZ 1.0.0-2 (2013-06-02)
 --------------------------------------
 
 - Fix IAPWS-95 calculations for subcrt(): in water.IAPWS95(), rename
@@ -11,6 +11,11 @@
 - In water.AW90(), analytical solution to polynomial equation is
   now used to calculate dielectric constant. Thanks to Marc Neveu.
 
+- count.aa() and read.fasta() handle DNA sequences, with new argument
+  'type' (protein or DNA).
+
+- nucleicacids() split into nucleic.formula() and nucleic.complement().
+
 CHANGES IN CHNOSZ 1.0.0 (2013-03-28)
 ------------------------------------
 

Modified: pkg/CHNOSZ/inst/tests/test-util.seq.R
===================================================================
--- pkg/CHNOSZ/inst/tests/test-util.seq.R	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/inst/tests/test-util.seq.R	2013-06-02 05:15:42 UTC (rev 53)
@@ -1,9 +1,19 @@
 context("util.seq")
 
 test_that("count.aa() warns about unrecognized amino acids and performs substring operations", {
-  expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized amino acid code\\(s\\): B J")
+  expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized letter\\(s\\) in protein sequence: B J")
   myseq <- "AAAAAGGGGG"
   expect_equal(count.aa(myseq, stop=5)[, "G"], 0)
   expect_equal(count.aa(myseq, start=6)[, "A"], 0)
   expect_equal(count.aa(myseq, start=5, stop=6)[, c("A", "G")], c(1, 1), check.attributes=FALSE)
 })
+
+test_that("nucleobase sequences can be processed with count.aa(), nucleic.formula() and nucleic.complement()", {
+  expect_message(dna <- count.aa("ABCDEFGHIJ", type="DNA"), "count.aa: unrecognized letter\\(s\\) in DNA sequence: B D E F H I J")
+  expect_equal(as.numeric(dna), c(1, 1, 1, 0))
+  expect_equal(nucleic.formula(dna), "C14H15N13O2")
+  # nucleobases can be in any order
+  expect_equal(nucleic.formula(dna[, 4:1, drop=FALSE]), "C14H15N13O2")
+  # ACG -> UGC (RNA complement)
+  expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4")
+})

Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/man/util.fasta.Rd	2013-06-02 05:15:42 UTC (rev 53)
@@ -14,8 +14,8 @@
   grep.file(file, pattern = "", y = NULL, ignore.case = TRUE, 
     startswith = ">", lines = NULL, grep = "grep")
   read.fasta(file, i = NULL, ret = "count", lines = NULL, 
-    ihead = NULL, pnff = FALSE, start=NULL, stop=NULL)
-  count.aa(seq, start=NULL, stop=NULL)
+    ihead = NULL, start=NULL, stop=NULL, type="protein")
+  count.aa(seq, start=NULL, stop=NULL, type="protein")
   uniprot.aa(protein, start=NULL, stop=NULL)
 }
 
@@ -30,9 +30,9 @@
   \item{i}{numeric, line numbers of sequence headers to read}
   \item{ret}{character, specification for type of return (count, sequence, or FASTA format)}
   \item{ihead}{numeric, which lines are headers}
-  \item{pnff}{logical, get the protein name from the filename?}
   \item{start}{numeric, position in sequence to start counting}
   \item{stop}{numeric, position in sequence to stop counting}
+  \item{type}{character, sequence type (protein or DNA)}
   \item{seq}{character, amino acid sequence of a protein}
   \item{protein}{character, entry name for protein in UniProt}
 }
@@ -53,10 +53,11 @@
 If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. 
 Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
 
-\code{count.aa} counts the occurrences of each amino acid in a sequence (\code{seq}), returning a data frame with amino acids in the same order as \code{thermo$protein}.
-It is not case-sensitive.
-A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid abbreviations.
-\code{start} and/or \code{stop} can be provided to count amino acids in a fragment of the sequence (extracted using \code{\link{substr}}).
+\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
+For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
+Letters are matched without regard for case.
+A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations.
+\code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}).
 If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}).
 
 \code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}; The UniProt Consortium, 2012).
@@ -71,7 +72,7 @@
 }
 
 \seealso{
-\code{\link{nucleicacids}} for counting occurrences of bases in a DNA or RNA sequence.
+\code{\link{nucleic.formula}} for an example of counting nucleobases in a DNA sequence.
 When computing relative abundances of many proteins that might be found with \code{grep.file} and \code{read.fasta}, consider using the \code{iprotein} arugment of \code{\link{affinity}} to speed things up; for an example see the help page for \code{\link{revisit}}. 
 }
 

Modified: pkg/CHNOSZ/man/util.seq.Rd
===================================================================
--- pkg/CHNOSZ/man/util.seq.Rd	2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/man/util.seq.Rd	2013-06-02 05:15:42 UTC (rev 53)
@@ -1,9 +1,9 @@
 \name{util.seq}
 \alias{util.seq}
 \alias{aminoacids}
+\alias{nucleic.formula}
+\alias{nucleic.complement}
 
-\alias{nucleicacids}
-
 \title{Functions to Work with Sequence Data}
 
 \description{
@@ -12,38 +12,42 @@
 
 \usage{
   aminoacids(nchar=1, which=NULL)
-  nucleicacids(seq, type = "DNA", comp = NULL, comp2 = NULL)
+  nucleic.formula(nucleic = NULL)
+  nucleic.complement(nucleic = NULL, type="DNA")
 }
 
 \arguments{
   \item{nchar}{numeric, \eqn{1} to return one-letter, \eqn{3} to return three-letter abbreviations for amino acids}
   \item{which}{character, which amino acids to name}
-  \item{seq}{character, base sequence of a nucleic acid (\code{nucleicacids})}
-  \item{type}{character, type of nucleic acid sequence (DNA or RNA) (\code{nucleicads})}
-  \item{comp}{character, type of complement sequence}
-  \item{comp2}{character, type of second complement sequence}
+  \item{nucleic}{data frame, counts of nucleic-acid bases}
+  \item{type}{character, target type of nucleic acid (DNA or RNA)}
 }
 
 \details{
 
-  Depending on the value of \code{nchar}, \code{aminoacids} returns the one-letter abbreviations (\samp{1}) or the three-letter abbreviations (\samp{3}) or the names of the neutral amino acids (\samp{""}) or the names of the amino acids with ionized side chains (\samp{"Z"}). The default includes 20 amino acids in the order used in \code{thermo$protein}, unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids).
+\code{aminoacids} returns the one-letter abbreviations (\code{nchar}=\samp{1}) or the three-letter abbreviations (\code{nchar}=\samp{3}) or the names of the neutral amino acids (\code{nchar}=\samp{""}) or the names of the amino acids with ionized side chains (\code{nchar}=\samp{"Z"}).
+The output includes 20 amino acids in alphabetic order by 1-letter abbreviation (the order used in \code{thermo$protein}), unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids).
 
-  \code{aminoacids} takes a character value or list of character values containing a protein sequence. The number of occurrences of each type of amino acid is counted and output in a data frame with 20 columns, ordered in the same way as \code{thermo$protein}.
 
-  \code{nucleicacids} takes a DNA or RNA sequence and counts the numbers of bases of each type. Whether the sequence is DNA or RNA is specified by \code{type}. Setting \code{comp} to \samp{DNA} or \samp{RNA} tells the function to compute the base composition of that type of complement of the sequence. If \code{comp2} is specified, another complement is taken. The two rounds of complementing can be used in a single function call e.g. to go from a sequence on DNA minus strand (given in \code{seq}) to the plus strand (with \code{comp="DNA"}) and then from the DNA plus strand to RNA (with \code{comp2="RNA"}). The value returned by the function is a dataframe of base composition, which can be passed back to the function to obtain the overall chemical formula for the bases.
+\code{nucleic.formula} returns a string representation of the chemical formula for each nucleic-acid composition contained in \code{nucleic}.
+The names of the bases are indicated by the column names of \code{nucleic}.
+At present, the formula is computed as the sum of the chemical formulas of the bases themselves, with no contribution from polymerization (dehydration) or phosphorylation.
 
+\code{nucleic.complement} calculates the complement of the base composition given in \code{nucleic}. 
+\code{type} specifies the type of nucleic acid of the complement - \samp{DNA} (A, G, C, T) or \samp{RNA} (A, G, C, U).
 }
 
+\seealso{\code{\link{count.aa}} for counting amino acids or nucleic-acid bases in a sequence; \code{\link{protein.formula}} for calculating the chemical formulas of proteins.}
+
 \examples{\dontshow{data(thermo)}
 ## count nucleobases in a sequence
-nucleicacids("ACCGGGTTT")
+bases <- count.aa("ACCGGGTTT", type="DNA")
 # the DNA complement of that sequence
-nucleicacids("ACCGGGTTT", comp="DNA")
+DNA.comp <- nucleic.complement(bases)
 # the RNA complement of the DNA complement
-n <- nucleicacids("ACCGGGTTT", comp="DNA", comp2="RNA")
-# the formula of the RNA complement
-(nf <- nucleicacids(n, type="RNA"))
-stopifnot(identical(nf, "C40H42N32O11"))
+RNA.comp <- nucleic.complement(DNA.comp, type="RNA")
+# the formula of the RNA complement (bases only)
+nucleic.formula(RNA.comp)  # C40H42N32O11
 }
 
 \keyword{util}



More information about the CHNOSZ-commits mailing list