From noreply at r-forge.r-project.org Sun Jun 2 07:15:42 2013 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 2 Jun 2013 07:15:42 +0200 (CEST) Subject: [CHNOSZ-commits] r53 - in pkg/CHNOSZ: . R inst inst/tests man Message-ID: <20130602051542.BE0FB1841F3@r-forge.r-project.org> Author: jedick Date: 2013-06-02 07:15:42 +0200 (Sun, 02 Jun 2013) New Revision: 53 Modified: pkg/CHNOSZ/DESCRIPTION pkg/CHNOSZ/R/protein.info.R pkg/CHNOSZ/R/util.fasta.R pkg/CHNOSZ/R/util.seq.R pkg/CHNOSZ/inst/NEWS pkg/CHNOSZ/inst/tests/test-util.seq.R pkg/CHNOSZ/man/util.fasta.Rd pkg/CHNOSZ/man/util.seq.Rd Log: count.aa() and read.fasta() handle DNA sequences Modified: pkg/CHNOSZ/DESCRIPTION =================================================================== --- pkg/CHNOSZ/DESCRIPTION 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/DESCRIPTION 2013-06-02 05:15:42 UTC (rev 53) @@ -1,6 +1,6 @@ -Date: 2013-05-06 +Date: 2013-06-02 Package: CHNOSZ -Version: 1.0.0-1 +Version: 1.0.0-2 Title: Chemical Thermodynamics and Activity Diagrams Author: Jeffrey M. Dick Maintainer: Jeffrey M. Dick Modified: pkg/CHNOSZ/R/protein.info.R =================================================================== --- pkg/CHNOSZ/R/protein.info.R 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/R/protein.info.R 2013-06-02 05:15:42 UTC (rev 53) @@ -79,8 +79,8 @@ protein.length <- function(protein, organism=NULL) { # calculate the length(s) of proteins aa <- ip2aa(protein, organism) - # loop over the proteins - pl <- sapply(1:nrow(aa), function(i) sum(aa[i, 6:25])) + # use rowSums on the columns containing amino acid counts + pl <- as.numeric(rowSums(aa[, 6:25])) return(pl) } Modified: pkg/CHNOSZ/R/util.fasta.R =================================================================== --- pkg/CHNOSZ/R/util.fasta.R 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/R/util.fasta.R 2013-06-02 05:15:42 UTC (rev 53) @@ -45,22 +45,19 @@ } read.fasta <- function(file, i=NULL, ret="count", lines=NULL, ihead=NULL, - pnff=FALSE, start=NULL, stop=NULL) { + start=NULL, stop=NULL, type="protein") { # read sequences from a fasta file - if(file != "") msgout("read.fasta: reading ",basename(file),"\n") - # all of them or only those indicated by i - # if aa=TRUE compile a data frame of the amino acid - # compositions suitable for add.protein # some of the following code was adapted from # read.fasta in package seqinR - # TODO: better test for type of system + # value of 'i' is what sequences to read # value of 'ret' determines format of return value: - # aa: amino acid composition (same columns as thermo$protein) + # count: amino acid composition (same columns as thermo$protein, can be used by add.protein) + # or nucleic acid base composition (A-C-G-T) # seq: amino acid sequence # fas: fasta entry - # pnff: take the protein name from filename (TRUE) or entry name (FALSE) is.nix <- Sys.info()[[1]]=="Linux" if(is.nix & is.null(lines)) { + msgout("read.fasta: reading ",basename(file),"\n") # figure out whether to use 'cat', 'zcat' or 'xzcat' suffix <- substr(file,nchar(file)-2,nchar(file)) if(suffix==".gz") mycat <- "zcat" @@ -72,7 +69,10 @@ lines <- system(paste(mycat,' "',file,'"',sep=""),intern=TRUE) linefun <- function(i1,i2) lines[i1:i2] } else { - if(is.null(lines)) lines <- readLines(file) + if(is.null(lines)) { + lines <- readLines(file) + msgout("read.fasta: reading ",basename(file),"\n") + } nlines <- length(lines) if(is.null(ihead)) ihead <- which(substr(lines,1,1)==">") linefun <- function(i1,i2) lines[i1:i2] @@ -97,52 +97,42 @@ for(i in 1:length(begin)) iline <- c(iline,(begin[i]-1):end[i]) return(lines[iline]) } - # get each sequences from the begin to end lines + # get each sequence from the begin to end lines seqfun <- function(i) paste(linefun(begin[i],end[i]),collapse="") sequences <- lapply(1:length(i), seqfun) - # process the header line for each entry - # (strip the ">" and go to the first space or underscore) - nomfun <- function(befund) { - nomnomfun <- function(j,pnff) { - # get the text of the line - f1 <- linefun(i[j],i[j]) - # stop if the first character is not ">" - # or the first two charaters are "> " - if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0)) - stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'.")) - # discard the leading '>' - f2 <- substr(f1, 2, nchar(f1)) - # keep everything before the first space - f3 <- strsplit(f2," ")[[1]][1] - # then before or after the first underscore - if(befund) f4 <- strsplit(f3,"_")[[1]][1] - else f4 <- strsplit(f3,"_")[[1]][2] - return(f4) - } - noms <- as.character(palply(1:length(i),nomnomfun)) - return(noms) - } - # process the file name + # organism name is from file name # (basename minus extension) bnf <- strsplit(basename(file),split=".",fixed=TRUE)[[1]][1] - if(pnff) { - # protein name is from file name - # organism name is from entry - protein <- bnf - organism <- nomfun(befund=FALSE) - } else { - # vice versa - protein <- nomfun(befund=TRUE) - organism <- bnf - } + organism <- bnf + # protein/gene name is from header line for entry + # (strip the ">" and go to the first space or underscore) + id <- as.character(palply(1:length(i), function(j) { + # get the text of the line + f1 <- linefun(i[j],i[j]) + # stop if the first character is not ">" + # or the first two charaters are "> " + if(substr(f1,1,1)!=">" | length(grep("^> ",f1)>0)) + stop(paste("file",basename(file),"line",j,"doesn't begin with FASTA header '>'.")) + # discard the leading '>' + f2 <- substr(f1, 2, nchar(f1)) + # keep everything before the first space + f3 <- strsplit(f2," ")[[1]][1] + # then before or after the first underscore + return(strsplit(f3,"_")[[1]][1]) + } )) if(ret=="count") { - aa <- count.aa(sequences, start, stop) - colnames(aa) <- aminoacids(3) + counts <- count.aa(sequences, start, stop, type) ref <- abbrv <- NA chains <- 1 - # 20090507 made stringsAsFactors FALSE - return(cbind(data.frame(protein=protein,organism=organism, - ref=ref,abbrv=abbrv,chains=chains,stringsAsFactors=FALSE),aa)) + if(type=="protein") { + colnames(counts) <- aminoacids(3) + # 20090507 made stringsAsFactors FALSE + out <- cbind(data.frame(protein=id, organism=organism, + ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts) + } else if(type %in% c("DNA", "RNA")) { + out <- cbind(data.frame(gene=id, organism=organism, + ref=ref, abbrv=abbrv, chains=chains, stringsAsFactors=FALSE), counts) + } } else return(sequences) } @@ -187,13 +177,14 @@ return(aa) } -count.aa <- function(seq, start=NULL, stop=NULL) { - # count amino acids in one or more sequences - # sequences are given as elements of the list seq - aa <- aminoacids(1) +count.aa <- function(seq, start=NULL, stop=NULL, type="protein") { + # count amino acids or DNA bases in one or more sequences given as elements of the list seq + # put them in alphabetical order (amino acids: same order as in thermo$protein) + if(type=="protein") letts <- aminoacids(1) + else if(type=="DNA") letts <- c("A", "C", "G", "T") + else stop(paste("unknown sequence type", type)) # to count the letters in each sequence countfun <- function(seq, start, stop) { - count <- numeric(20) # get a substring if one or both of start or stop are given # if only one of start or stop is given, get a default value for the other if(!is.null(start)) { @@ -203,21 +194,22 @@ seq <- substr(seq, 1, stop) } # the actual counting - naa <- table(strsplit(toupper(seq), "")[[1]]) - # put them in the same order as in aa (i.e. thermo$protein) - iaa <- match(names(naa), aa) - # in case any letters don't match some amino acid - ina <- is.na(iaa) - count[iaa[!ina]] <- naa[!ina] - if(any(ina)) msgout("count.aa: unrecognized amino acid code(s): ", - paste(names(naa)[ina], collapse=" "), "\n") + nnn <- table(strsplit(toupper(seq), "")[[1]]) + # get them in alphabetical order + ilett <- match(names(nnn), letts) + # in case any letters aren't in our alphabet + ina <- is.na(ilett) + if(any(ina)) msgout(paste("count.aa: unrecognized letter(s) in", type, "sequence:", + paste(names(nnn)[ina], collapse=" "), "\n")) + count <- numeric(length(letts)) + count[ilett[!ina]] <- nnn[!ina] return(count) } - # count amino acids in each sequence + # counts for each sequence a <- palply(seq, countfun, start, stop) a <- t(as.data.frame(a, optional=TRUE)) # clean up row/column names - colnames(a) <- aa + colnames(a) <- letts rownames(a) <- 1:nrow(a) return(a) } Modified: pkg/CHNOSZ/R/util.seq.R =================================================================== --- pkg/CHNOSZ/R/util.seq.R 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/R/util.seq.R 2013-06-02 05:15:42 UTC (rev 53) @@ -31,63 +31,45 @@ else if(nchar=="Z") return(aacharged[iaa]) } -nucleicacids <- function(seq=NULL,type="DNA",comp=NULL,comp2=NULL) { - # count bases or compute the formula, e.g. - # n <- nucleicacids(list("AGCT","TTTT")) # a dataframe of counts - # f <- nucleicacids(n) # a series of formulas +nucleic.formula <- function(nucleic=NULL) { + # compute the formula, e.g. + # DNA <- count.aa(list("AGCT", "TTTT"), type="DNA") # a dataframe of counts + # nf <- nucleic.formula(DNA) # a series of formulas + # !!! this only adds the formulas of the nucleobases; dehydration and phosphorylation are not yet accounted for !!! # 20090926 jmd - if(is.null(seq)) stop("please provide a sequence") - if(type=="DNA") { - na <- c("A","C","G","T") - na.NA <- c("adenine","cytosine","guanine","thymine") - } else if(type=="RNA") { - na <- c("U","G","C","A") - na.NA <- c("uracil","guanine","cytosine","adenine") - } else stop(paste("invalid type:",type)) - if(is.data.frame(seq)) { - # compute the chemical formula of bases - if(!all(na %in% colnames(seq))) { - nabases <- c2s(na[which(!na %in% colnames(seq))],sep=" ") - stop(paste("requested type is",type,"but",nabases,"is/are not in the colnames of supplied dataframe")) - } - # the formulas of each of the bases - f.base <- get("thermo")$obigt$formula[info(na.NA[match(colnames(seq),na)])] - # loop over the base counts - f.out <- character() - for(i in 1:nrow(seq)) { - # use makeup() with multipliers and sum=TRUE 20120119 jmd - f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(seq[i,]), sum=TRUE)) - f.out <- c(f.out,f) - } - return(f.out) - } else { - # count the numbers of nucleic acid bases in a sequence - # sequences are given as elements of the list seq - # to count the number of each amino acids in a sequence - count.na <- function(na,seq) sum(seq==na) - count.i <- function(i,seq) as.numeric(lapply(na,count.na,strsplit(toupper(seq[i]),"")[[1]])) - # count bases in each sequence - n <- t(as.data.frame(palply(1:length(seq),count.i,seq),optional=TRUE)) - n <- as.data.frame(n) - # clean up row/column names - colnames(n) <- na - rownames(n) <- 1:nrow(n) - # return the complement if requested e.g. - # nucleicacids(x,type,"DNA") # DNA complement - # nucleicacids(x,type,"RNA") # RNA complement - # nucleicacids(x,type,"DNA","RNA") # DNA, then RNA complement - if(!is.null(comp)) { - if(comp=="DNA") colnames(n) <- c("T","G","C","A") - else if(comp=="RNA") colnames(n) <- c("U","G","C","A") - else stop(paste("invalid complement request:",comp)) - } - if(!is.null(comp2)) { - if(comp2=="DNA") colnames(n) <- c("A","C","G","T") - else if(comp2=="RNA") colnames(n) <- c("A","C","G","U") - else stop(paste("invalid complement request:",comp)) - } - return(n) + letts <- c("A", "C", "G", "T", "U") + names <- c("adenine", "cytosine", "guanine", "thymine", "uracil") + # the locations of the letters in the data frame + i.lett <- match(letts, colnames(nucleic)) + # we'll normally have at least one NA (U or A for DNA or RNA) + ina <- is.na(i.lett) + # the species indices of the bases, in the order appearing above + i.base <- suppressMessages(info(names[!ina], check.it=FALSE)) + # the chemical formula of bases + f.base <- get("thermo")$obigt$formula[i.base] + # loop over the base counts + f.out <- character() + for(i in 1:nrow(nucleic)) { + # use makeup() with multipliers and sum=TRUE 20120119 jmd + f <- as.chemical.formula(makeup(f.base, multiplier=as.numeric(nucleic[i, i.lett[!ina]]), sum=TRUE)) + f.out <- c(f.out, f) } + return(f.out) } - +nucleic.complement <- function(nucleic=NULL, type="DNA") { + # return the nucleobase complement + # nucleic.complement(nucleic, "DNA") # DNA complement + # nucleic.complement(nucleic, "RNA") # RNA complement + # the reference sequence, and its DNA and RNA complements + ref <- c("A", "C", "G", "T", "U") + DNA <- c("T", "G", "C", "A", "A") + RNA <- c("U", "G", "C", "A", "A") + iref <- match(colnames(nucleic), ref) + i.base <- which(!is.na(iref)) + colnames(nucleic)[i.base] <- get(type)[iref[i.base]] + # be nice and re-alphabetize the columns + o.base <- order(colnames(nucleic)[i.base]) + nucleic <- nucleic[, i.base[o.base], drop=FALSE] + return(nucleic) +} Modified: pkg/CHNOSZ/inst/NEWS =================================================================== --- pkg/CHNOSZ/inst/NEWS 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/inst/NEWS 2013-06-02 05:15:42 UTC (rev 53) @@ -1,4 +1,4 @@ -CHANGES IN CHNOSZ 1.0.0-1 (2013-05-06) +CHANGES IN CHNOSZ 1.0.0-2 (2013-06-02) -------------------------------------- - Fix IAPWS-95 calculations for subcrt(): in water.IAPWS95(), rename @@ -11,6 +11,11 @@ - In water.AW90(), analytical solution to polynomial equation is now used to calculate dielectric constant. Thanks to Marc Neveu. +- count.aa() and read.fasta() handle DNA sequences, with new argument + 'type' (protein or DNA). + +- nucleicacids() split into nucleic.formula() and nucleic.complement(). + CHANGES IN CHNOSZ 1.0.0 (2013-03-28) ------------------------------------ Modified: pkg/CHNOSZ/inst/tests/test-util.seq.R =================================================================== --- pkg/CHNOSZ/inst/tests/test-util.seq.R 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/inst/tests/test-util.seq.R 2013-06-02 05:15:42 UTC (rev 53) @@ -1,9 +1,19 @@ context("util.seq") test_that("count.aa() warns about unrecognized amino acids and performs substring operations", { - expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized amino acid code\\(s\\): B J") + expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized letter\\(s\\) in protein sequence: B J") myseq <- "AAAAAGGGGG" expect_equal(count.aa(myseq, stop=5)[, "G"], 0) expect_equal(count.aa(myseq, start=6)[, "A"], 0) expect_equal(count.aa(myseq, start=5, stop=6)[, c("A", "G")], c(1, 1), check.attributes=FALSE) }) + +test_that("nucleobase sequences can be processed with count.aa(), nucleic.formula() and nucleic.complement()", { + expect_message(dna <- count.aa("ABCDEFGHIJ", type="DNA"), "count.aa: unrecognized letter\\(s\\) in DNA sequence: B D E F H I J") + expect_equal(as.numeric(dna), c(1, 1, 1, 0)) + expect_equal(nucleic.formula(dna), "C14H15N13O2") + # nucleobases can be in any order + expect_equal(nucleic.formula(dna[, 4:1, drop=FALSE]), "C14H15N13O2") + # ACG -> UGC (RNA complement) + expect_equal(nucleic.formula(nucleic.complement(dna, "RNA")), "C13H14N10O4") +}) Modified: pkg/CHNOSZ/man/util.fasta.Rd =================================================================== --- pkg/CHNOSZ/man/util.fasta.Rd 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/man/util.fasta.Rd 2013-06-02 05:15:42 UTC (rev 53) @@ -14,8 +14,8 @@ grep.file(file, pattern = "", y = NULL, ignore.case = TRUE, startswith = ">", lines = NULL, grep = "grep") read.fasta(file, i = NULL, ret = "count", lines = NULL, - ihead = NULL, pnff = FALSE, start=NULL, stop=NULL) - count.aa(seq, start=NULL, stop=NULL) + ihead = NULL, start=NULL, stop=NULL, type="protein") + count.aa(seq, start=NULL, stop=NULL, type="protein") uniprot.aa(protein, start=NULL, stop=NULL) } @@ -30,9 +30,9 @@ \item{i}{numeric, line numbers of sequence headers to read} \item{ret}{character, specification for type of return (count, sequence, or FASTA format)} \item{ihead}{numeric, which lines are headers} - \item{pnff}{logical, get the protein name from the filename?} \item{start}{numeric, position in sequence to start counting} \item{stop}{numeric, position in sequence to stop counting} + \item{type}{character, sequence type (protein or DNA)} \item{seq}{character, amino acid sequence of a protein} \item{protein}{character, entry name for protein in UniProt} } @@ -53,10 +53,11 @@ If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}. Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to ""). -\code{count.aa} counts the occurrences of each amino acid in a sequence (\code{seq}), returning a data frame with amino acids in the same order as \code{thermo$protein}. -It is not case-sensitive. -A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid abbreviations. -\code{start} and/or \code{stop} can be provided to count amino acids in a fragment of the sequence (extracted using \code{\link{substr}}). +\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}). +For amino acids, the columns in the returned data frame are in the same order as \code{thermo$protein}. +Letters are matched without regard for case. +A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations. +\code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}). If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}). \code{uniprot.aa} returns a data frame of amino acid composition, in the format of \code{thermo$protein}, retrieved from the protein sequence if it is available from UniProt (\url{http://uniprot.org}; The UniProt Consortium, 2012). @@ -71,7 +72,7 @@ } \seealso{ -\code{\link{nucleicacids}} for counting occurrences of bases in a DNA or RNA sequence. +\code{\link{nucleic.formula}} for an example of counting nucleobases in a DNA sequence. When computing relative abundances of many proteins that might be found with \code{grep.file} and \code{read.fasta}, consider using the \code{iprotein} arugment of \code{\link{affinity}} to speed things up; for an example see the help page for \code{\link{revisit}}. } Modified: pkg/CHNOSZ/man/util.seq.Rd =================================================================== --- pkg/CHNOSZ/man/util.seq.Rd 2013-05-06 14:35:56 UTC (rev 52) +++ pkg/CHNOSZ/man/util.seq.Rd 2013-06-02 05:15:42 UTC (rev 53) @@ -1,9 +1,9 @@ \name{util.seq} \alias{util.seq} \alias{aminoacids} +\alias{nucleic.formula} +\alias{nucleic.complement} -\alias{nucleicacids} - \title{Functions to Work with Sequence Data} \description{ @@ -12,38 +12,42 @@ \usage{ aminoacids(nchar=1, which=NULL) - nucleicacids(seq, type = "DNA", comp = NULL, comp2 = NULL) + nucleic.formula(nucleic = NULL) + nucleic.complement(nucleic = NULL, type="DNA") } \arguments{ \item{nchar}{numeric, \eqn{1} to return one-letter, \eqn{3} to return three-letter abbreviations for amino acids} \item{which}{character, which amino acids to name} - \item{seq}{character, base sequence of a nucleic acid (\code{nucleicacids})} - \item{type}{character, type of nucleic acid sequence (DNA or RNA) (\code{nucleicads})} - \item{comp}{character, type of complement sequence} - \item{comp2}{character, type of second complement sequence} + \item{nucleic}{data frame, counts of nucleic-acid bases} + \item{type}{character, target type of nucleic acid (DNA or RNA)} } \details{ - Depending on the value of \code{nchar}, \code{aminoacids} returns the one-letter abbreviations (\samp{1}) or the three-letter abbreviations (\samp{3}) or the names of the neutral amino acids (\samp{""}) or the names of the amino acids with ionized side chains (\samp{"Z"}). The default includes 20 amino acids in the order used in \code{thermo$protein}, unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids). +\code{aminoacids} returns the one-letter abbreviations (\code{nchar}=\samp{1}) or the three-letter abbreviations (\code{nchar}=\samp{3}) or the names of the neutral amino acids (\code{nchar}=\samp{""}) or the names of the amino acids with ionized side chains (\code{nchar}=\samp{"Z"}). +The output includes 20 amino acids in alphabetic order by 1-letter abbreviation (the order used in \code{thermo$protein}), unless \code{which} is provided, indicating the desired amino acids (either as 1- or 3-letter abbreviations or names of the neutral amino acids). - \code{aminoacids} takes a character value or list of character values containing a protein sequence. The number of occurrences of each type of amino acid is counted and output in a data frame with 20 columns, ordered in the same way as \code{thermo$protein}. - \code{nucleicacids} takes a DNA or RNA sequence and counts the numbers of bases of each type. Whether the sequence is DNA or RNA is specified by \code{type}. Setting \code{comp} to \samp{DNA} or \samp{RNA} tells the function to compute the base composition of that type of complement of the sequence. If \code{comp2} is specified, another complement is taken. The two rounds of complementing can be used in a single function call e.g. to go from a sequence on DNA minus strand (given in \code{seq}) to the plus strand (with \code{comp="DNA"}) and then from the DNA plus strand to RNA (with \code{comp2="RNA"}). The value returned by the function is a dataframe of base composition, which can be passed back to the function to obtain the overall chemical formula for the bases. +\code{nucleic.formula} returns a string representation of the chemical formula for each nucleic-acid composition contained in \code{nucleic}. +The names of the bases are indicated by the column names of \code{nucleic}. +At present, the formula is computed as the sum of the chemical formulas of the bases themselves, with no contribution from polymerization (dehydration) or phosphorylation. +\code{nucleic.complement} calculates the complement of the base composition given in \code{nucleic}. +\code{type} specifies the type of nucleic acid of the complement - \samp{DNA} (A, G, C, T) or \samp{RNA} (A, G, C, U). } +\seealso{\code{\link{count.aa}} for counting amino acids or nucleic-acid bases in a sequence; \code{\link{protein.formula}} for calculating the chemical formulas of proteins.} + \examples{\dontshow{data(thermo)} ## count nucleobases in a sequence -nucleicacids("ACCGGGTTT") +bases <- count.aa("ACCGGGTTT", type="DNA") # the DNA complement of that sequence -nucleicacids("ACCGGGTTT", comp="DNA") +DNA.comp <- nucleic.complement(bases) # the RNA complement of the DNA complement -n <- nucleicacids("ACCGGGTTT", comp="DNA", comp2="RNA") -# the formula of the RNA complement -(nf <- nucleicacids(n, type="RNA")) -stopifnot(identical(nf, "C40H42N32O11")) +RNA.comp <- nucleic.complement(DNA.comp, type="RNA") +# the formula of the RNA complement (bases only) +nucleic.formula(RNA.comp) # C40H42N32O11 } \keyword{util}