[CHNOSZ-commits] r53 - in pkg/CHNOSZ: . R inst inst/tests man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Jun 2 07:15:42 CEST 2013
Author: jedick
Date: 2013-06-02 07:15:42 +0200 (Sun, 02 Jun 2013)
New Revision: 53
Modified:
pkg/CHNOSZ/DESCRIPTION
pkg/CHNOSZ/R/protein.info.R
pkg/CHNOSZ/R/util.fasta.R
pkg/CHNOSZ/R/util.seq.R
pkg/CHNOSZ/inst/NEWS
pkg/CHNOSZ/inst/tests/test-util.seq.R
pkg/CHNOSZ/man/util.fasta.Rd
pkg/CHNOSZ/man/util.seq.Rd
Log:
count.aa() and read.fasta() handle DNA sequences
Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/DESCRIPTION 2013-06-02 05:15:42 UTC (rev 53)
@@ -1,6 +1,6 @@
-Date: 2013-05-06
+Date: 2013-06-02
Package: CHNOSZ
-Version: 1.0.0-1
+Version: 1.0.0-2
Title: Chemical Thermodynamics and Activity Diagrams
Author: Jeffrey M. Dick
Maintainer: Jeffrey M. Dick <j3ffdick at gmail.com>
Modified: pkg/CHNOSZ/R/protein.info.R
===================================================================
--- pkg/CHNOSZ/R/protein.info.R 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/protein.info.R 2013-06-02 05:15:42 UTC (rev 53)
@@ -79,8 +79,8 @@
protein.length <- function(protein, organism=NULL) {
# calculate the length(s) of proteins
aa <- ip2aa(protein, organism)
- # loop over the proteins
- pl <- sapply(1:nrow(aa), function(i) sum(aa[i, 6:25]))
+ # use rowSums on the columns containing amino acid counts
+ pl <- as.numeric(rowSums(aa[, 6:25]))
return(pl)
}
Modified: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/util.fasta.R 2013-06-02 05:15:42 UTC (rev 53)
@@ -45,22 +45,19 @@
}
read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL,
- pnff=FALSE, start=NULL, stop=NULL) {
+ start=NULL, stop=NULL, type="protein") {
# read sequences from a fasta file
- if(file != "") msgout("read.fasta: reading ",basename(file),"\n")
- # all of them or only those indicated by i
- # if aa=TRUE compile a data frame of the amino acid
- # compositions suitable for add.protein
# some of the following code was adapted from
# read.fasta in package seqinR
- # TODO: better test for type of system
+ # value of 'i' is what sequences to read
# value of 'ret' determines format of return value:
- # aa: amino acid composition (same columns as thermo$protein)
+ # count: amino acid composition (same columns as thermo$protein, can be used by add.protein)
+ # or nucleic acid base composition (A-C-G-T)
# seq: amino acid sequence
# fas: fasta entry
- # pnff: take the protein name from filename (TRUE) or entry name (FALSE)
is.nix <- Sys.info()[[1]]=="Linux"
if(is.nix & is.null(lines)) {
+ msgout("read.fasta: reading ",basename(file),"\n")
# figure out whether to use 'cat', 'zcat' or 'xzcat'
suffix <- substr(file,nchar(file)-2,nchar(file))
if(suffix==".gz") mycat <- "zcat"
@@ -72,7 +69,10 @@
lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE)
linefun <- function(i1,i2) lines[i1:i2]
} else {
- if(is.null(lines)) lines <- readLines(file)
+ if(is.null(lines)) {
+ lines <- readLines(file)
+ msgout("read.fasta: reading ",basename(file),"\n")
+ }
nlines <- length(lines)
if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">")
linefun <- function(i1,i2) lines[i1:i2]
@@ -97,52 +97,42 @@
for(i in 1:length(begin)) iline <- c(iline,(begin[i]-1):end[i])
return(lines[iline])
}
- # get each sequences from the begin to end lines
+ # get each sequence from the begin to end lines
seqfun <- function(i) paste(linefun(begin[i],end[i]),collapse="")
sequences <- lapply(1:length(i), seqfun)
- # process the header line for each entry
- # (strip the ">" and go to the first space or underscore)
- nomfun <- function(befund) {
- nomnomfun <- function(j,pnff) {
- # get the text of the line
- f1 <- linefun(i[j],i[j])
- # stop if the first character is not ">"
- # or the first two charaters are "> "
- if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0))
- stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'."))
- # discard the leading '>'
- f2 <- substr(f1, 2, nchar(f1))
- # keep everything before the first space
- f3 <- strsplit(f2," ")[[1]][1]
- # then before or after the first underscore
- if(befund) f4 <- strsplit(f3,"_")[[1]][1]
- else f4 <- strsplit(f3,"_")[[1]][2]
- return(f4)
- }
- noms <- as.character(palply(1:length(i),nomnomfun))
- return(noms)
- }
- # process the file name
+ # organism name is from file name
# (basename minus extension)
bnf <- strsplit(basename(file),split=".",fixed=TRUE)[[1]][1]
- if(pnff) {
- # protein name is from file name
- # organism name is from entry
- protein <- bnf
- organism <- nomfun(befund=FALSE)
- } else {
- # vice versa
- protein <- nomfun(befund=TRUE)
- organism <- bnf
- }
+ organism <- bnf
+ # protein/gene name is from header line for entry
+ # (strip the ">" and go to the first space or underscore)
+ id <- as.character(palply(1:length(i), function(j) {
+ # get the text of the line
+ f1 <- linefun(i[j],i[j])
+ # stop if the first character is not ">"
+ # or the first two charaters are "> "
+ if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0))
+ stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'."))
+ # discard the leading '>'
+ f2 <- substr(f1, 2, nchar(f1))
+ # keep everything before the first space
+ f3 <- strsplit(f2," ")[[1]][1]
+ # then before or after the first underscore
+ return(strsplit(f3,"_")[[1]][1])
+ } ))
if(ret=="count") {
- aa <- count.aa(sequences, start, stop)
- colnames(aa) <- aminoacids(3)
+ counts <- count.aa(sequences, start, stop, type)
ref <- abbrv <- NA
chains <- 1
- # 20090507 made stringsAsFactors FALSE
- return(cbind(data.frame(protein=protein,organism=organism,
- ref=ref,abbrv=abbrv,chains=chains,stringsAsFactors=FALSE),aa))
+ if(type=="protein") {
+ colnames(counts) <- aminoacids(3)
+ # 20090507 made stringsAsFactors FALSE
+ out <- cbind(data.frame(protein=id, organism=organism,
+ ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
+ } else if(type %in% c("DNA", "RNA")) {
+ out <- cbind(data.frame(gene=id, organism=organism,
+ ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts)
+ }
} else return(sequences)
}
@@ -187,13 +177,14 @@
return(aa)
}
-count.aa <- function(seq, start=NULL, stop=NULL) {
- # count amino acids in one or more sequences
- # sequences are given as elements of the list seq
- aa <- aminoacids(1)
+count.aa <- function(seq, start=NULL, stop=NULL, type="protein") {
+ # count amino acids or DNA bases in one or more sequences given as elements of the list seq
+ # put them in alphabetical order (amino acids: same order as in thermo$protein)
+ if(type=="protein") letts <- aminoacids(1)
+ else if(type=="DNA") letts <- c("A", "C", "G", "T")
+ else stop(paste("unknown sequence type", type))
# to count the letters in each sequence
countfun <- function(seq, start, stop) {
- count <- numeric(20)
# get a substring if one or both of start or stop are given
# if only one of start or stop is given, get a default value for the other
if(!is.null(start)) {
@@ -203,21 +194,22 @@
seq <- substr(seq, 1, stop)
}
# the actual counting
- naa <- table(strsplit(toupper(seq), "")[[1]])
- # put them in the same order as in aa (i.e. thermo$protein)
- iaa <- match(names(naa), aa)
- # in case any letters don't match some amino acid
- ina <- is.na(iaa)
- count[iaa[!ina]] <- naa[!ina]
- if(any(ina)) msgout("count.aa: unrecognized amino acid code(s): ",
- paste(names(naa)[ina], collapse=" "), "\n")
+ nnn <- table(strsplit(toupper(seq), "")[[1]])
+ # get them in alphabetical order
+ ilett <- match(names(nnn), letts)
+ # in case any letters aren't in our alphabet
+ ina <- is.na(ilett)
+ if(any(ina)) msgout(paste("count.aa: unrecognized letter(s) in", type, "sequence:",
+ paste(names(nnn)[ina], collapse=" "), "\n"))
+ count <- numeric(length(letts))
+ count[ilett[!ina]] <- nnn[!ina]
return(count)
}
- # count amino acids in each sequence
+ # counts for each sequence
a <- palply(seq, countfun, start, stop)
a <- t(as.data.frame(a, optional=TRUE))
# clean up row/column names
- colnames(a) <- aa
+ colnames(a) <- letts
rownames(a) <- 1:nrow(a)
return(a)
}
Modified: pkg/CHNOSZ/R/util.seq.R
===================================================================
--- pkg/CHNOSZ/R/util.seq.R 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/R/util.seq.R 2013-06-02 05:15:42 UTC (rev 53)
@@ -31,63 +31,45 @@
else if(nchar=="Z") return(aacharged[iaa])
}
-nucleicacids <- function(seq=NULL,type="DNA",comp=NULL,comp2=NULL) {
- # count bases or compute the formula, e.g.
- # n <- nucleicacids(list("AGCT","TTTT")) # a dataframe of counts
- # f <- nucleicacids(n) # a series of formulas
+nucleic.formula <- function(nucleic=NULL) {
+ # compute the formula, e.g.
+ # DNA <- count.aa(list("AGCT", "TTTT"), type="DNA") # a dataframe of counts
+ # nf <- nucleic.formula(DNA) # a series of formulas
+ # !!! this only adds the formulas of the nucleobases; dehydration and phosphorylation are not yet accounted for !!!
# 20090926 jmd
- if(is.null(seq)) stop("please provide a sequence")
- if(type=="DNA") {
- na <- c("A","C","G","T")
- na.NA <- c("adenine","cytosine","guanine","thymine")
- } else if(type=="RNA") {
- na <- c("U","G","C","A")
- na.NA <- c("uracil","guanine","cytosine","adenine")
- } else stop(paste("invalid type:",type))
- if(is.data.frame(seq)) {
- # compute the chemical formula of bases
- if(!all(na %in% colnames(seq))) {
- nabases <- c2s(na[which(!na %in% colnames(seq))],sep=" ")
- stop(paste("requested type is",type,"but",nabases,"is/are not in the colnames of supplied dataframe"))
- }
- # the formulas of each of the bases
- f.base <- get("thermo")$obigt$formula[info(na.NA[match(colnames(seq),na)])]
- # loop over the base counts
- f.out <- character()
- for(i in 1:nrow(seq)) {
- # use makeup() with multipliers and sum=TRUE 20120119 jmd
- f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(seq[i,]), sum=TRUE))
- f.out <- c(f.out,f)
- }
- return(f.out)
- } else {
- # count the numbers of nucleic acid bases in a sequence
- # sequences are given as elements of the list seq
- # to count the number of each amino acids in a sequence
- count.na <- function(na,seq) sum(seq==na)
- count.i <- function(i,seq) as.numeric(lapply(na,count.na,strsplit(toupper(seq[i]),"")[[1]]))
- # count bases in each sequence
- n <- t(as.data.frame(palply(1:length(seq),count.i,seq),optional=TRUE))
- n <- as.data.frame(n)
- # clean up row/column names
- colnames(n) <- na
- rownames(n) <- 1:nrow(n)
- # return the complement if requested e.g.
- # nucleicacids(x,type,"DNA") # DNA complement
- # nucleicacids(x,type,"RNA") # RNA complement
- # nucleicacids(x,type,"DNA","RNA") # DNA, then RNA complement
- if(!is.null(comp)) {
- if(comp=="DNA") colnames(n) <- c("T","G","C","A")
- else if(comp=="RNA") colnames(n) <- c("U","G","C","A")
- else stop(paste("invalid complement request:",comp))
- }
- if(!is.null(comp2)) {
- if(comp2=="DNA") colnames(n) <- c("A","C","G","T")
- else if(comp2=="RNA") colnames(n) <- c("A","C","G","U")
- else stop(paste("invalid complement request:",comp))
- }
- return(n)
+ letts <- c("A", "C", "G", "T", "U")
+ names <- c("adenine", "cytosine", "guanine", "thymine", "uracil")
+ # the locations of the letters in the data frame
+ i.lett <- match(letts, colnames(nucleic))
+ # we'll normally have at least one NA (U or A for DNA or RNA)
+ ina <- is.na(i.lett)
+ # the species indices of the bases, in the order appearing above
+ i.base <- suppressMessages(info(names[!ina], check.it=FALSE))
+ # the chemical formula of bases
+ f.base <- get("thermo")$obigt$formula[i.base]
+ # loop over the base counts
+ f.out <- character()
+ for(i in 1:nrow(nucleic)) {
+ # use makeup() with multipliers and sum=TRUE 20120119 jmd
+ f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(nucleic[i, i.lett[!ina]]), sum=TRUE))
+ f.out <- c(f.out, f)
}
+ return(f.out)
}
-
+nucleic.complement <- function(nucleic=NULL, type="DNA") {
+ # return the nucleobase complement
+ # nucleic.complement(nucleic, "DNA") # DNA complement
+ # nucleic.complement(nucleic, "RNA") # RNA complement
+ # the reference sequence, and its DNA and RNA complements
+ ref <- c("A", "C", "G", "T", "U")
+ DNA <- c("T", "G", "C", "A", "A")
+ RNA <- c("U", "G", "C", "A", "A")
+ iref <- match(colnames(nucleic), ref)
+ i.base <- which(!is.na(iref))
+ colnames(nucleic)[i.base] <- get(type)[iref[i.base]]
+ # be nice and re-alphabetize the columns
+ o.base <- order(colnames(nucleic)[i.base])
+ nucleic <- nucleic[, i.base[o.base], drop=FALSE]
+ return(nucleic)
+}
Modified: pkg/CHNOSZ/inst/NEWS
===================================================================
--- pkg/CHNOSZ/inst/NEWS 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/inst/NEWS 2013-06-02 05:15:42 UTC (rev 53)
@@ -1,4 +1,4 @@
-CHANGES IN CHNOSZ 1.0.0-1 (2013-05-06)
+CHANGES IN CHNOSZ 1.0.0-2 (2013-06-02)
--------------------------------------
- Fix IAPWS-95 calculations for subcrt(): in water.IAPWS95(), rename
@@ -11,6 +11,11 @@
- In water.AW90(), analytical solution to polynomial equation is
now used to calculate dielectric constant. Thanks to Marc Neveu.
+- count.aa() and read.fasta() handle DNA sequences, with new argument
+ 'type' (protein or DNA).
+
+- nucleicacids() split into nucleic.formula() and nucleic.complement().
+
CHANGES IN CHNOSZ 1.0.0 (2013-03-28)
------------------------------------
Modified: pkg/CHNOSZ/inst/tests/test-util.seq.R
===================================================================
--- pkg/CHNOSZ/inst/tests/test-util.seq.R 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/inst/tests/test-util.seq.R 2013-06-02 05:15:42 UTC (rev 53)
@@ -1,9 +1,19 @@
context("util.seq")
test_that("count.aa() warns about unrecognized amino acids and performs substring operations", {
- expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized amino acid code\\(s\\): B J")
+ expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized letter\\(s\\) in protein sequence: B J")
myseq <- "AAAAAGGGGG"
expect_equal(count.aa(myseq, stop=5)[, "G"], 0)
expect_equal(count.aa(myseq, start=6)[, "A"], 0)
expect_equal(count.aa(myseq, start=5, stop=6)[, c("A", "G")], c(1, 1), check.attributes=FALSE)
})
+
+test_that("nucleobase sequences can be processed with count.aa(), nucleic.formula() and nucleic.complement()", {
+ expect_message(dna <- count.aa("ABCDEFGHIJ", type="DNA"), "count.aa: unrecognized letter\\(s\\) in DNA sequence: B D E F H I J")
+ expect_equal(as.numeric(dna), c(1, 1, 1, 0))
+ expect_equal(nucleic.formula(dna), "C14H15N13O2")
+ # nucleobases can be in any order
+ expect_equal(nucleic.formula(dna[, 4:1, drop=FALSE]), "C14H15N13O2")
+ # ACG -> UGC (RNA complement)
+ expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4")
+})
Modified: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/man/util.fasta.Rd 2013-06-02 05:15:42 UTC (rev 53)
@@ -14,8 +14,8 @@
grep.file(file, pattern = "", y = NULL, ignore.case = TRUE,
startswith = ">", lines = NULL, grep = "grep")
read.fasta(file, i = NULL, ret = "count", lines = NULL,
- ihead = NULL, pnff = FALSE, start=NULL, stop=NULL)
- count.aa(seq, start=NULL, stop=NULL)
+ ihead = NULL, start=NULL, stop=NULL, type="protein")
+ count.aa(seq, start=NULL, stop=NULL, type="protein")
uniprot.aa(protein, start=NULL, stop=NULL)
}
@@ -30,9 +30,9 @@
\item{i}{numeric, line numbers of sequence headers to read}
\item{ret}{character, specification for type of return (count, sequence, or FASTA format)}
\item{ihead}{numeric, which lines are headers}
- \item{pnff}{logical, get the protein name from the filename?}
\item{start}{numeric, position in sequence to start counting}
\item{stop}{numeric, position in sequence to stop counting}
+ \item{type}{character, sequence type (protein or DNA)}
\item{seq}{character, amino acid sequence of a protein}
\item{protein}{character, entry name for protein in UniProt}
}
@@ -53,10 +53,11 @@
If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}.
Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
-\code{count.aa} counts the occurrences of each amino acid in a sequence (\code{seq}), returning a data frame with amino acids in the same order as \code{thermo$protein}.
-It is not case-sensitive.
-A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid abbreviations.
-\code{start} and/or \code{stop} can be provided to count amino acids in a fragment of the sequence (extracted using \code{\link{substr}}).
+\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
+For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}.
+Letters are matched without regard for case.
+A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations.
+\code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}).
If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}).
\code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}; The UniProt Consortium, 2012).
@@ -71,7 +72,7 @@
}
\seealso{
-\code{\link{nucleicacids}} for counting occurrences of bases in a DNA or RNA sequence.
+\code{\link{nucleic.formula}} for an example of counting nucleobases in a DNA sequence.
When computing relative abundances of many proteins that might be found with \code{grep.file} and \code{read.fasta}, consider using the \code{iprotein} arugment of \code{\link{affinity}} to speed things up; for an example see the help page for \code{\link{revisit}}.
}
Modified: pkg/CHNOSZ/man/util.seq.Rd
===================================================================
--- pkg/CHNOSZ/man/util.seq.Rd 2013-05-06 14:35:56 UTC (rev 52)
+++ pkg/CHNOSZ/man/util.seq.Rd 2013-06-02 05:15:42 UTC (rev 53)
@@ -1,9 +1,9 @@
\name{util.seq}
\alias{util.seq}
\alias{aminoacids}
+\alias{nucleic.formula}
+\alias{nucleic.complement}
-\alias{nucleicacids}
-
\title{Functions to Work with Sequence Data}
\description{
@@ -12,38 +12,42 @@
\usage{
aminoacids(nchar=1, which=NULL)
- nucleicacids(seq, type = "DNA", comp = NULL, comp2 = NULL)
+ nucleic.formula(nucleic = NULL)
+ nucleic.complement(nucleic = NULL, type="DNA")
}
\arguments{
\item{nchar}{numeric, \eqn{1} to return one-letter, \eqn{3} to return three-letter abbreviations for amino acids}
\item{which}{character, which amino acids to name}
- \item{seq}{character, base sequence of a nucleic acid (\code{nucleicacids})}
- \item{type}{character, type of nucleic acid sequence (DNA or RNA) (\code{nucleicads})}
- \item{comp}{character, type of complement sequence}
- \item{comp2}{character, type of second complement sequence}
+ \item{nucleic}{data frame, counts of nucleic-acid bases}
+ \item{type}{character, target type of nucleic acid (DNA or RNA)}
}
\details{
- Depending on the value of \code{nchar}, \code{aminoacids} returns the one-letter abbreviations (\samp{1}) or the three-letter abbreviations (\samp{3}) or the names of the neutral amino acids (\samp{""}) or the names of the amino acids with ionized side chains (\samp{"Z"}). The default includes 20 amino acids in the order used in \code{thermo$protein}, unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids).
+\code{aminoacids} returns the one-letter abbreviations (\code{nchar}=\samp{1}) or the three-letter abbreviations (\code{nchar}=\samp{3}) or the names of the neutral amino acids (\code{nchar}=\samp{""}) or the names of the amino acids with ionized side chains (\code{nchar}=\samp{"Z"}).
+The output includes 20 amino acids in alphabetic order by 1-letter abbreviation (the order used in \code{thermo$protein}), unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids).
- \code{aminoacids} takes a character value or list of character values containing a protein sequence. The number of occurrences of each type of amino acid is counted and output in a data frame with 20 columns, ordered in the same way as \code{thermo$protein}.
- \code{nucleicacids} takes a DNA or RNA sequence and counts the numbers of bases of each type. Whether the sequence is DNA or RNA is specified by \code{type}. Setting \code{comp} to \samp{DNA} or \samp{RNA} tells the function to compute the base composition of that type of complement of the sequence. If \code{comp2} is specified, another complement is taken. The two rounds of complementing can be used in a single function call e.g. to go from a sequence on DNA minus strand (given in \code{seq}) to the plus strand (with \code{comp="DNA"}) and then from the DNA plus strand to RNA (with \code{comp2="RNA"}). The value returned by the function is a dataframe of base composition, which can be passed back to the function to obtain the overall chemical formula for the bases.
+\code{nucleic.formula} returns a string representation of the chemical formula for each nucleic-acid composition contained in \code{nucleic}.
+The names of the bases are indicated by the column names of \code{nucleic}.
+At present, the formula is computed as the sum of the chemical formulas of the bases themselves, with no contribution from polymerization (dehydration) or phosphorylation.
+\code{nucleic.complement} calculates the complement of the base composition given in \code{nucleic}.
+\code{type} specifies the type of nucleic acid of the complement - \samp{DNA} (A, G, C, T) or \samp{RNA} (A, G, C, U).
}
+\seealso{\code{\link{count.aa}} for counting amino acids or nucleic-acid bases in a sequence; \code{\link{protein.formula}} for calculating the chemical formulas of proteins.}
+
\examples{\dontshow{data(thermo)}
## count nucleobases in a sequence
-nucleicacids("ACCGGGTTT")
+bases <- count.aa("ACCGGGTTT", type="DNA")
# the DNA complement of that sequence
-nucleicacids("ACCGGGTTT", comp="DNA")
+DNA.comp <- nucleic.complement(bases)
# the RNA complement of the DNA complement
-n <- nucleicacids("ACCGGGTTT", comp="DNA", comp2="RNA")
-# the formula of the RNA complement
-(nf <- nucleicacids(n, type="RNA"))
-stopifnot(identical(nf, "C40H42N32O11"))
+RNA.comp <- nucleic.complement(DNA.comp, type="RNA")
+# the formula of the RNA complement (bases only)
+nucleic.formula(RNA.comp) # C40H42N32O11
}
\keyword{util}
More information about the CHNOSZ-commits
mailing list