[CHNOSZ-commits] r828 - in pkg/CHNOSZ: . R inst inst/extdata/protein inst/tinytest man src vignettes
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Feb 28 02:24:54 CET 2024
Author: jedick
Date: 2024-02-28 02:24:53 +0100 (Wed, 28 Feb 2024)
New Revision: 828
Removed:
pkg/CHNOSZ/R/util.fasta.R
pkg/CHNOSZ/inst/extdata/protein/EF-Tu.aln
pkg/CHNOSZ/inst/tinytest/test-util.seq.R
pkg/CHNOSZ/man/util.fasta.Rd
Modified:
pkg/CHNOSZ/DESCRIPTION
pkg/CHNOSZ/NAMESPACE
pkg/CHNOSZ/R/add.protein.R
pkg/CHNOSZ/inst/NEWS.Rd
pkg/CHNOSZ/inst/tinytest/test-add.protein.R
pkg/CHNOSZ/man/CHNOSZ-package.Rd
pkg/CHNOSZ/man/add.protein.Rd
pkg/CHNOSZ/man/extdata.Rd
pkg/CHNOSZ/man/palply.Rd
pkg/CHNOSZ/man/rank.affinity.Rd
pkg/CHNOSZ/man/util.seq.Rd
pkg/CHNOSZ/src/init.c
pkg/CHNOSZ/vignettes/anintro.Rmd
pkg/CHNOSZ/vignettes/mklinks.sh
Log:
Move read.fasta() and count.aa() to canprot
Modified: pkg/CHNOSZ/DESCRIPTION
===================================================================
--- pkg/CHNOSZ/DESCRIPTION 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/DESCRIPTION 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,6 +1,6 @@
-Date: 2024-02-11
+Date: 2024-02-28
Package: CHNOSZ
-Version: 2.1.0
+Version: 2.1.0-1
Title: Thermodynamic Calculations and Diagrams for Geochemistry
Authors@R: c(
person("Jeffrey", "Dick", , "j3ffdick at gmail.com", role = c("aut", "cre"),
@@ -9,7 +9,7 @@
Author: Jeffrey Dick [aut, cre] (0000-0002-0687-5890)
Maintainer: Jeffrey Dick <j3ffdick at gmail.com>
Depends: R (>= 3.1.0)
-Suggests: tinytest, knitr, rmarkdown, tufte
+Suggests: tinytest, knitr, rmarkdown, tufte, canprot
Imports: grDevices, graphics, stats, utils
Description: An integrated set of tools for thermodynamic calculations in
aqueous geochemistry and geobiochemistry. Functions are provided for writing
Modified: pkg/CHNOSZ/NAMESPACE
===================================================================
--- pkg/CHNOSZ/NAMESPACE 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/NAMESPACE 2024-02-28 01:24:53 UTC (rev 828)
@@ -9,8 +9,8 @@
"describe.property", "describe.basis", "equilibrate",
"aminoacids", "ZC.col",
"pinfo", "protein.length", "protein.formula",
- "read.fasta", "protein.basis", "add.protein",
- "unitize", "seq2aa",
+ "protein.basis", "add.protein",
+ "unitize",
"thermo.refs", "mod.OBIGT",
# examples
"examples", "demos", "mtitle",
@@ -20,7 +20,6 @@
"mass", "entropy", "GHS", "water",
"i2A",
"dPdTtr", "Ttr",
- "count.aa",
"rho.IAPWS95", "IAPWS95", "water.AW90", "WP02.auxiliary", "water.IAPWS95",
"getrank", "parent", "sciname", "allparents", "getnodes", "getnames",
"protein.OBIGT", "which.pmax",
Modified: pkg/CHNOSZ/R/add.protein.R
===================================================================
--- pkg/CHNOSZ/R/add.protein.R 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/R/add.protein.R 2024-02-28 01:24:53 UTC (rev 828)
@@ -2,22 +2,6 @@
# Calculate properties of proteins 20061109 jmd
# Reorganize protein functions 20120513
-# Count numbers of amino acids in a sequence
-seq2aa <- function(sequence, protein = NA) {
- # Remove newlines and whitespace
- sequence <- gsub("\\s", "", gsub("[\r\n]", "", sequence))
- # Make a data frame from counting the amino acids in the sequence
- caa <- count.aa(sequence)
- colnames(caa) <- aminoacids(3)
- # Now make the data frame
- po <- strsplit(as.character(protein), "_")[[1]]
- aa <- data.frame(protein = po[1], organism = po[2], ref = NA, abbrv = NA, stringsAsFactors = FALSE)
- # chains = 1 for any sequence, chains = 0 for no sequence
- chains <- sum(nchar(sequence) > 0)
- aa <- cbind(aa, chains = chains, caa)
- return(aa)
-}
-
# Add amino acid counts to thermo()$protein (returns iprotein)
add.protein <- function(aa, as.residue = FALSE) {
# Add a properly constructed data frame of
Deleted: pkg/CHNOSZ/R/util.fasta.R
===================================================================
--- pkg/CHNOSZ/R/util.fasta.R 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/R/util.fasta.R 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,151 +0,0 @@
-# CHNOSZ/util.fasta.R
-# Read and manipulate FASTA sequence files
-
-read.fasta <- function(file, iseq = NULL, ret = "count", lines = NULL, ihead = NULL,
- start = NULL, stop = NULL, type = "protein", id = NULL) {
- # Read sequences from a fasta file
- # Some of the following code was adapted from
- # read.fasta in package seqinR
- # value of 'iseq' is what sequences to read (default is all)
- # value of 'ret' determines format of return value:
- # count: amino acid composition (same columns as thermo()$protein, can be used by add.protein)
- # or nucleic acid base composition (A-C-G-T)
- # seq: amino acid sequence
- # fas: fasta entry
- # value of 'id' is used for 'protein' in output table,
- # otherwise ID is parsed from FASTA header (can take a while)
-
- # Check if the file is in an archive (https://github.com/jimhester/archive)
- if(inherits(file, "archive_read")) {
- is.archive <- TRUE
- filebase <- gsub("]", "", basename(summary(file)$description))
- } else {
- is.archive <- FALSE
- filebase <- basename(file)
- }
- if(is.null(lines)) {
- message("read.fasta: reading ", filebase, " ... ", appendLF = FALSE)
- is.nix <- Sys.info()[[1]] == "Linux"
- if(is.archive) {
- # We can't use scan here?
- lines <- readLines(file)
- } else if(is.nix) {
- # Retrieve contents using system command (seems slightly faster even than scan())
- # Figure out whether to use 'cat', 'zcat' or 'xzcat'
- suffix <- substr(file,nchar(file)-2,nchar(file))
- if(suffix == ".gz") mycat <- "zcat"
- else if(suffix == ".xz") mycat <- "xzcat"
- else mycat <- "cat"
- lines <- system(paste(mycat,' "',file,'"',sep = ""),intern = TRUE)
- } else lines <- scan(file, what = character(), sep = "\n", quiet = TRUE)
- }
- nlines <- length(lines)
- message(nlines, " lines ... ", appendLF = FALSE)
- if(is.null(ihead)) ihead <- which(substr(lines, 1, 1) == ">")
- message(length(ihead), " sequences")
- linefun <- function(i1, i2) lines[i1:i2]
- # Identify the lines that begin and end each sequence
- begin <- ihead + 1
- end <- ihead - 1
- end <- c(end[-1], nlines)
- # Use all or selected sequences
- if(is.null(iseq)) iseq <- seq_along(begin)
- # Just return the lines from the file
- if(ret == "fas") {
- iline <- numeric()
- for(i in iseq) iline <- c(iline, (begin[i]-1):end[i])
- return(lines[iline])
- }
- # Get each sequence from the begin to end lines
- seqfun <- function(i) paste(linefun(begin[i], end[i]), collapse = "")
- sequences <- lapply(iseq, seqfun)
- # Organism name is from file name
- # (basename minus extension)
- bnf <- strsplit(filebase, split = ".", fixed = TRUE)[[1]][1]
- organism <- bnf
- # Protein/gene name is from header line for entry
- # (strip the ">" and go to the first space)
- missid <- missing(id)
- if(is.null(id)) id <- as.character(lapply(iseq, function(j) {
- # Get the text of the line
- f1 <- linefun(ihead[j], ihead[j])
- # Stop if the first character is not ">"
-# or the first two characters are "> "
- if(substr(f1, 1, 1) != ">" | length(grep("^> ", f1)>0))
- stop(paste("file", filebase, "line", j, "doesn't begin with FASTA header '>'."))
- # Discard the leading '>'
- f2 <- substr(f1, 2, nchar(f1))
- # Keep everything before the first space
- return(strsplit(f2, " ")[[1]][1])
- } ))
- if(ret == "count") {
- counts <- count.aa(sequences, start, stop, type)
- ref <- abbrv <- NA
- chains <- 1
- if(type == "protein") {
- colnames(counts) <- aminoacids(3)
- # 20090507 Made stringsAsFactors FALSE
- out <- cbind(data.frame(protein = id, organism = organism,
- ref = ref, abbrv = abbrv, chains = chains, stringsAsFactors = FALSE), counts)
- # 20170117 Extra processing for files from UniProt
- isUniProt <- grepl("\\|......\\|.*_", out$protein[1])
- if(isUniProt & missid) {
- p1 <- sapply(strsplit(out$protein, "\\|"), "[", 1)
- p2 <- sapply(strsplit(out$protein, "\\|"), "[", 2)
- p3 <- sapply(strsplit(out$protein, "\\|"), "[", 3)
- out$abbrv <- sapply(strsplit(p3, "_"), "[", 1)
- out$organism <- sapply(strsplit(p3, "_"), "[", 2)
- out$protein <- paste0(p1, "|", p2)
- }
- out
- } else if(type %in% c("DNA", "RNA")) {
- cbind(data.frame(gene = id, organism = organism,
- ref = ref, abbrv = abbrv, chains = chains, stringsAsFactors = FALSE), counts)
- }
- } else return(sequences)
-}
-
-count.aa <- function(seq, start = NULL, stop = NULL, type = "protein") {
- # Count amino acids or DNA bases in one or more sequences given as elements of the list seq
- if(type == "protein") letts <- aminoacids(1)
- else if(type == "DNA") letts <- c("A", "C", "G", "T")
- else if(type == "RNA") letts <- c("A", "C", "G", "U")
- else stop(paste("unknown sequence type", type))
- # The numerical positions of the letters in alphabetical order (i.e. for amino acids, same order as in thermo()$protein)
- ilett <- match(letts, LETTERS)
- # The letters A-Z represented by raw values
- rawAZ <- charToRaw("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
- # To count the letters in each sequence
- countfun <- function(seq, start, stop) {
- # Get a substring if one or both of start or stop are given
- # If only one of start or stop is given, get a default value for the other
- if(!is.null(start)) {
- if(is.null(stop)) stop <- nchar(seq)
- seq <- substr(seq, start, stop)
- } else if(!is.null(stop)) {
- seq <- substr(seq, 1, stop)
- }
- ## The actual counting ...
- #nnn <- table(strsplit(toupper(seq), "")[[1]])
- # ... Replaced with C version 20180217
- counts <- .C(C_count_letters, seq, integer(26))[[2]]
- # which is equivalent to this R code:
- #rawseq <- charToRaw(toupper(seq))
- #counts <- sapply(rawAZ, function(x) sum(rawseq == x))
- return(counts)
- }
- # Counts for each sequence
- counts <- lapply(seq, countfun, start, stop)
- counts <- do.call(rbind, counts)
- # Check for letters that aren't in our alphabet
- ina <- colSums(counts[, -ilett, drop = FALSE]) > 0
- if(any(ina)) {
- message(paste("count.aa: unrecognized letter(s) in", type, "sequence:", paste(LETTERS[-ilett][ina], collapse = " ")))
- }
- counts <- counts[, ilett, drop = FALSE]
- # Clean up row/column names
- colnames(counts) <- letts
- rownames(counts) <- 1:nrow(counts)
- return(counts)
-}
-
Modified: pkg/CHNOSZ/inst/NEWS.Rd
===================================================================
--- pkg/CHNOSZ/inst/NEWS.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/NEWS.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -15,6 +15,15 @@
\newcommand{\Cp}{\ifelse{latex}{\eqn{C_P}}{\ifelse{html}{\out{<I>C<sub>P</sub></I>}}{Cp}}}
\newcommand{\DG0}{\ifelse{latex}{\eqn{{\Delta}G^{\circ}}}{\ifelse{html}{\out{Δ<I>G</I>°}}{ΔG°}}}
+\section{Changes in CHNOSZ version 2.1.0-1 (2024-02-28)}{
+
+ \itemize{
+ \item Move \code{read.fasta()} and \code{count.aa()} to canprot package.
+ \item Remove \code{seq2aa()}.
+ }
+
+}
+
\section{Changes in CHNOSZ version 2.1.0 (2024-02-11)}{
\subsection{NEW FEATURES}{
Deleted: pkg/CHNOSZ/inst/extdata/protein/EF-Tu.aln
===================================================================
--- pkg/CHNOSZ/inst/extdata/protein/EF-Tu.aln 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/extdata/protein/EF-Tu.aln 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,64 +0,0 @@
->EFTU1_ML-stem
-MAKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLALKGLAEAKAYDQIDKAPEEKARG
-ITINIAHVEYETEKRHYAHIDCPGHADYIKNMITGAAQMDGAILVVAATDGPMPQTREHI
-LLARQVGVPYIVVFINKVDMVDDPELLELVEMEVRDLLSKYEFPGDEVPVIRGSALKALE
-APQKWYEKILELLDAVDEYIPTPERDVDKPFLMPIEDVFSITGRGTVVTGRIERGVIKPG
-DEVEIVGLSETKKTTVTSVEMFRKLLDEGIAGDNVGVLLRGIDKEEVERGQVLAKPGSIT
-PHTKFKAQVYVLKKEEGGRHTPFFNGYRPQFYFRTTDVTGIVELPEGVEMVMPGDNVEMT
-VELIYPIAMEEGLRFAIREGGRTVGAGVVTKIIE
->EFTU1_Alt-stem
-MAKEKFERTKPHVNIGTIGHVDHGKTTLTAAITKVLSLKGLAEAKAYDQIDNAPEEKARG
-ITINITHVEYETEKRHYAHIDCPGHADYIKNMITGAAQMDGAILVVAATDGPMPQTREHV
-LLARQVGVPYIVVFLNKCDMVDDEELLELVEMEVRELLSKYDFPGDDVPVIRGSALKALE
-GDNEWYKPILELMDAVDNYIPDPERDVDKPFLMPIEDVFSITGRGTVVTGRIERGVIKPG
-DEVEIVGLKETKKTTVTSVEMFRKLLDEGQAGDNVGCLLRGIEKEEVERGQVLAKPGSIT
-PHTKFEAQVYVLKKEEGGRHTPFFNNYRPQFYFRTTDVTGIVELPEGVEMVMPGDNVEMT
-VELIYPIAIEEGLRFAIREGGRTVGAGVVTKIIE
->EFTU1_ML-meso
-MAKEKFERTKPHVNIGTIGHVDHGKTTLTAAITTVLAKKGLAEARAYDQIDNAPEEKERG
-ITINISHVEYETEKRHYAHVDCPGHADYVKNMITGAAQMDGAILVVAATDGPMPQTREHI
-LLARQVGVPYIVVALNKCDMVDDEELLELVEMEVRELLSSYDFDGDDTPVIRVSALKALE
-GDEKWVEKILELMDAVDEYIPTPERDTDKPFLMPIEDVFTITGRGTVVTGRVERGVLKVG
-DEVEIVGIKETQKTTVTGIEMFRKLLDEAQAGDNVGLLLRGIKREDVERGQVLAKPGSIT
-PHTKFEAEVYVLSKEEGGRHTPFFNNYRPQFYFRTTDVTGVITLPEGTEMVMPGDNVEMT
-VELIAPIAMEEGLRFAIREGGRTVGAGRVTKIIK
->EFTU1_ECOLI
-MSKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGAARAFDQIDNAPEEKARG
-ITINTSHVEYDTPTRHYAHVDCPGHADYVKNMITGAAQMDGAILVVAATDGPMPQTREHI
-LLGRQVGVPYIIVFLNKCDMVDDEELLELVEMEVRELLSQYDFPGDDTPIVRGSALKALE
-GDAEWEAKILELAGFLDSYIPEPERAIDKPFLLPIEDVFSISGRGTVVTGRVERGIIKVG
-EEVEIVGIKETQKSTCTGVEMFRKLLDEGRAGENVGVLLRGIKREEIERGQVLAKPGTIK
-PHTKFESEVYILSKDEGGRHTPFFKGYRPQFYFRTTDVTGTIELPEGVEMVMPGDNIKMV
-VTLIHPIAMDDGLRFAIREGGRTVGAGVVAKVLS
->EFTU1_THETH
-MAKGEFVRTKPHVNVGTIGHVDHGKTTLTAALTYVAAAENNVEVKDYGDIDKAPEERARG
-ITINTAHVEYETAKRHYSHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQTREHI
-LLARQVGVPYIVVFMNKVDMVDDPELLDLVEMEVRDLLNQYEFPGDEVPVIRGSALLALE
-QMHEWVDKIWELLDAIDEYIPTPVRDVDKPFLMPVEDVFTITGRGTVATGRIERGKVKVG
-DEVEIVGLAETRRTVVTGVEMHRKTLQEGIAGDNVGVLLRGVSREEVERGQVLAKPGSIT
-PHTKFEASVYVLKKEEGGRHTGFFSGYRPQFYFRTTDVTGVVQLPPGVEMVMPGDNVTFT
-VELIKPVALEEGLRFAIREGGRTVGAGVVTKILE
->EFTU_THEMA
-MAKEKFVRTKPHVNVGTIGHIDHGKSTLTAAITKYLSLKGLAQYIPYDQIDKAPEEKARG
-ITINITHVEYETEKRHYAHIDCPGHADYIKNMITGAAQMDGAILVVAATDGPMPQTREHV
-LLARQVEVPYMIVFINKTDMVDDPELIDLVEMEVRDLLSQYGYPGDEVPVIRGSALKAVE
-APNEAYKPIQELLDAMDNYIPDPQRDVDKPFLMPIEDVFSITGRGTVVTGRIERGRIRPG
-DEVEIIGLSEIKKTVVTSVEMFRKELDEGIAGDNVGCLLRGIDKDEVERGQVLAAPGSIK
-PHKRFKAQIYVLKKEEGGRHTPFTKGYKPQFYIRTADVTGIVGLPEGVEMVMPGDHVEME
-IELIYPVAIEKGQRFAVREGGRTVGAGVVTEVIE
->EFTU1_ML-stem-DE
-MAKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLALKGLAEAKAYEQIDKAPEEKARG
-ITINIAHVEYETEKRHYAHIDCPGHADYIKNMITGAAQMDGAILVVAATDGPMPQTREHI
-LLARQVGVPYIVVFINKVDMVDDPELLELVEMEVRDLLSKYEFPGDEVPVIRGSALKALE
-APQKWYEKILELLDAVDEYIPTPERDVDKPFLMPIEDVFSITGRGTVVTGRIERGVIKPG
-DEVEIVGLSETKKTTVTSVEMFRKLLDEGIAGDNVGVLLRGIDKEEVERGQVLAKPGSIT
-PHTKFKAQVYVLKKEEGGRHTPFFNGYRPQFYFRTTDVTGIVELPEGVEMVMPGDNVEMT
-VELIYPIAMEEGLRFAIREGGRTVGAGVVTKIIE
->EFTU1_ML-stem-DE-ST
-MAKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLALKGLAEAKAYEQIDKAPEEKARG
-ITINIAHVEYETEKRHYAHIDCPGHADYIKNMITGAAQMDGAILVVAATDGPMPQTREHI
-LLARQVGVPYIVVFINKVDMVDDPELLELVEMEVRDLLSKYEFPGDEVPVIRGTALKALE
-APQKWYEKILELLDAVDEYIPTPERDVDKPFLMPIEDVFSITGRGTVVTGRIERGVIKPG
-DEVEIVGLSETKKTTVTSVEMFRKLLDEGIAGDNVGVLLRGIDKEEVERGQVLAKPGSIT
-PHTKFKAQVYVLKKEEGGRHTPFFNGYRPQFYFRTTDVTGIVELPEGVEMVMPGDNVEMT
-VELIYPIAMEEGLRFAIREGGRTVGAGVVTKIIE
Modified: pkg/CHNOSZ/inst/tinytest/test-add.protein.R
===================================================================
--- pkg/CHNOSZ/inst/tinytest/test-add.protein.R 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/tinytest/test-add.protein.R 2024-02-28 01:24:53 UTC (rev 828)
@@ -12,7 +12,7 @@
expect_equal(ip1, ip2, info = info)
info <- "Errors and messages occur in some circumstances"
-expect_error(add.protein(count.aa("AAA")), "does not have the same columns as thermo\\(\\)\\$protein", info = info)
+expect_error(add.protein(canprot::count.aa("AAA")), "does not have the same columns as thermo\\(\\)\\$protein", info = info)
expect_message(add.protein(pinfo(pinfo("CYC_BOVIN"))), "replaced 1 existing protein\\(s\\)", info = info)
info <- "group additivity for proteins gives expected values"
@@ -33,12 +33,13 @@
expect_equal(formula, lprop$formula, info = info)
info <- "read.fasta() identifies sequences correctly and gives amino acid compositions in the correct format"
-ffile <- system.file("extdata/protein/EF-Tu.aln", package = "CHNOSZ")
-aa <- read.fasta(ffile)
-expect_equal(aa[1, ], read.fasta(ffile, 1), info = info)
+ffile <- system.file("extdata/protein/rubisco.fasta", package = "CHNOSZ")
+aa <- canprot::read.fasta(ffile)
+expect_equal(aa[1, ], canprot::read.fasta(ffile, 1), info = info)
# Use unlist here so that different row names are not compared
-expect_equal(unlist(aa[8, ]), unlist(read.fasta(ffile, 8)), info = info)
-expect_message(ip1 <- add.protein(aa), "added 8 new protein\\(s\\)", info = info)
-expect_message(ip2 <- add.protein(aa), "replaced 8 existing protein\\(s\\)", info = info)
+aa8 <- canprot::read.fasta(ffile, 1:8)
+expect_equal(unlist(aa[1:8, ]), unlist(aa8), info = info)
+expect_message(ip1 <- add.protein(aa8), "added 8 new protein\\(s\\)", info = info)
+expect_message(ip2 <- add.protein(aa8), "replaced 8 existing protein\\(s\\)", info = info)
# add.protein should return the correct indices for existing proteins
expect_equal(ip1, ip2, info = info)
Deleted: pkg/CHNOSZ/inst/tinytest/test-util.seq.R
===================================================================
--- pkg/CHNOSZ/inst/tinytest/test-util.seq.R 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/inst/tinytest/test-util.seq.R 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,18 +0,0 @@
-# Load default settings for CHNOSZ
-reset()
-
-info <- "count.aa() warns about unrecognized amino acids and performs substring operations"
-expect_message(count.aa("ABCDEFGHIJ"), "count.aa: unrecognized letter\\(s\\) in protein sequence: B J", info = info)
-myseq <- "AAAAAGGGGG"
-expect_equal(count.aa(myseq, stop = 5)[, "G"], 0, info = info)
-expect_equal(count.aa(myseq, start = 6)[, "A"], 0, info = info)
-expect_equal(count.aa(myseq, start = 5, stop = 6)[, c("A", "G")], c(1, 1), check.attributes = FALSE, info = info)
-
-info <- "Nucleobase sequences can be processed with count.aa()"
-expect_message(dna <- count.aa("ABCDEFGHIJ", type = "DNA"), "count.aa: unrecognized letter\\(s\\) in DNA sequence: B D E F H I J", info = info)
-expect_equal(as.numeric(dna), c(1, 1, 1, 0), info = info)
-
-info <- "count.aa() correctly processes a longer nucleobase sequence"
-seq <- "ATGTCCCGTTTCTTAGTTGCATTGGTTGCCGCACTTTTAGGAGTTGCAATTGAGATGTCCCTTCTCGTTCGCGCTCAGGGGCAGCAAACCTTGCTTTTGGCTGAAGAAAGCAAGCATTTGTCGCAATTGCGTCAACTGACTTTTGAAGGCACCAATGCCGAAGCGTATTGGTCGCCTGACGGGAAATGGTTGGTCTTTCAATCCACACGCCCACCTTACAAGGCTGACCAAATCTTCATCATGAGAGCGGATGGCTCGGGAGTTCGTGTCGTCAGCACGGGCAAAGGTCGTTGCACTTGTGCCTATTTCACGCCAGATGGCAAAGGCGTTATCTTTGCTACGACCCACCTTGCTGGACCAGAACCGCCGCAAGTGCCCAAACTGGACATTCCACGCTATGTTTGGGGCGTGTTCCCAAGTTACGAACTTTACCTGCGGCGTTTGGACACGATGGAACTTATCCGCTTGACCGATAACGAAGGCTACGACGCTGAAGCGACCATTTGCTGGAAGACTGGGCGAATTGTCTTCACAAGTTACCGCAATGGCGACCTTGACCTTTACAGCATGAAATTAGACGGCAGCGATTTGAAGCGATTGACGAAAACCATCGGCTACGAGGGCGGAGCGTTCTACTCGCCCGACGGGAAGCGGATTGTCTTCCGAGCCTATTTGCCAAAGACGCCTGACGAAATTGACGAATACAAGCGGTTGCTCCAGTTAGGCGTCATAAGCCCACCAAAGATGGAGTGGGTCGTCATGGACGCCGACGGTCGCAACATGAAGCAAATC"
-counts <- data.frame(A = 190, C = 203, G = 211, T = 188)
-expect_equal(as.numeric(count.aa(seq, type = "DNA")), as.numeric(counts), info = info)
Modified: pkg/CHNOSZ/man/CHNOSZ-package.Rd
===================================================================
--- pkg/CHNOSZ/man/CHNOSZ-package.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/CHNOSZ-package.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -24,7 +24,7 @@
\item Thermodynamic data: \code{\link{data}}, \code{\link{extdata}}, \code{\link{add.OBIGT}}, \code{\link{util.data}}
\item Thermodynamic calculations: \code{\link{util.formula}}, \code{\link{makeup}}, \code{\link{util.units}}, \code{\link{Berman}}, \code{\link{nonideal}}, \code{\link{util.misc}}
\item Water properties: \code{\link{water}}, \code{\link{util.water}}, \code{\link{DEW}}, \code{\link{IAPWS95}}
- \item Protein properties: \code{\link{protein.info}}, \code{\link{add.protein}}, \code{\link{util.fasta}}, \code{\link{util.protein}}, \code{\link{util.seq}}, \code{\link{ionize.aa}}
+ \item Protein properties: \code{\link{protein.info}}, \code{\link{add.protein}}, \code{\link{util.protein}}, \code{\link{util.seq}}, \code{\link{ionize.aa}}
\item Other tools: \code{\link{examples}}, \code{\link{taxonomy}}
\item Utility functions: \code{\link{util.expression}}, \code{\link{util.plot}}, \code{\link{util.array}}, \code{\link{util.list}}, \code{\link{palply}}
}
Modified: pkg/CHNOSZ/man/add.protein.Rd
===================================================================
--- pkg/CHNOSZ/man/add.protein.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/add.protein.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,7 +1,6 @@
\encoding{UTF-8}
\name{add.protein}
\alias{add.protein}
-\alias{seq2aa}
\alias{aasum}
\title{Amino Acid Compositions of Proteins}
\description{
@@ -10,7 +9,6 @@
\usage{
add.protein(aa, as.residue = FALSE)
- seq2aa(sequence, protein = NA)
aasum(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL)
}
@@ -17,7 +15,6 @@
\arguments{
\item{aa}{data frame, amino acid composition in the format of \code{thermo()$protein}}
\item{as.residue}{logical, normalize by protein length?}
- \item{sequence}{character, protein sequence}
\item{protein}{character, name of protein; numeric, indices of proteins (rownumbers of \code{\link{thermo}()$protein})}
\item{abundance}{numeric, abundances of proteins}
\item{average}{logical, return the weighted average of amino acid counts?}
@@ -31,11 +28,7 @@
The purpose of the functions described here is to identify proteins and work with their amino acid compositions.
From the amino acid compositions, the thermodynamic properties of the proteins can be estimated by group additivity.
-\code{seq2aa} returns a data frame of amino acid composition for the provided \code{sequence}, in the format of \code{thermo()$protein}.
-In this function, the value of the \code{protein} argument is put into the \code{protein} column of the result.
-If there is an underscore (e.g. \samp{LYSC_CHICK}), it is used to split the text, and the two parts are put into the \code{protein} and \code{organism} columns.
-
-Given amino acid compositions returned by \code{seq2aa}, \code{add.protein} adds them to \code{thermo()$protein} for use by other functions in CHNOSZ.
+Given a data frame of amino acid compositions in the format of \code{thermo()$protein}, \code{add.protein} adds them to \code{thermo()$protein} for use by other functions in CHNOSZ.
The amino acid compositions of proteins in \code{aa} with the same name as one in \code{thermo()$protein} are replaced.
Set \code{as.residue} to TRUE to normalize by protein length; each input amino acid composition is divided by the corresponding number of residues, with the result that the sum of amino acid frequencies for each protein is 1.
@@ -46,7 +39,6 @@
}
\value{
-For \code{seq2aa}, a data frame of amino acid composition and identifying information for proteins.
For \code{add.protein}, the rownumbers of \code{thermo()$protein} that are added and/or replaced.
For \code{aasum}, a one-row data frame of amino acid composition and identifying information.
}
@@ -53,27 +45,22 @@
\examples{
\dontshow{reset()}
-# Get the amino acid composition of a protein sequence
-# (Human Gastric juice peptide 1)
-aa <- seq2aa("LAAGKVEDSD", "GAJU_HUMAN")
-# Add the protein to CHNOSZ
-ip <- add.protein(aa)
-# Calculate the protein length and chemical formula
-protein.length(ip) # 10
-as.chemical.formula(protein.formula(ip)) # "C41H69N11O18"
+# Read a file with the amino acid compositions of several poliovirus protein subunits
+file <- system.file("extdata/protein/POLG.csv", package = "CHNOSZ")
+aa <- read.csv(file)
-# Calculate a formula without using add.protein
-aa <- seq2aa("ANLSG", "pentapeptide_test")
-as.chemical.formula(protein.formula(aa))
+# Add the proteins to CHNOSZ
+iprotein <- add.protein(aa)
+# Calculate length and elemental formula
+protein.length(iprotein)
+protein.formula(iprotein)
-# Sum the amino acid compositions of several poliovirus protein subunits
-file <- system.file("extdata/protein/POLG.csv", package = "CHNOSZ")
-aa <- read.csv(file, as.is = TRUE)
+# Sum the amino acid compositions and assign a new protein name
aasum(aa, protein = "POLG_sum")
}
\seealso{
-\code{\link{read.fasta}} for another way of getting amino acid compositions that can be used with \code{add.protein}.
+\code{\link[canprot]{read.fasta}} for reading amino acid compositions from FASTA files.
\code{\link{pinfo}} for protein-level functions (length, chemical formulas, reaction coefficients of basis species).
}
Modified: pkg/CHNOSZ/man/extdata.Rd
===================================================================
--- pkg/CHNOSZ/man/extdata.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/extdata.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -39,7 +39,6 @@
Files in \code{protein} contain protein sequences and amino acid compositions for proteins.
\itemize{
- \item \code{EF-Tu.aln} consists of aligned sequences (394 amino acids) of elongation factor Tu (EF-Tu). The sequences correspond to those taken from UniProtKB for ECOLI (\emph{Escherichia coli}), THETH (\emph{Thermus thermophilus}) and THEMA (\emph{Thermotoga maritima}), and reconstructed ancestral sequences taken from Gaucher et al., 2003 (maximum likelihood bacterial stem and mesophilic bacterial stem, and alternative bacterial stem). See \code{\link{read.fasta}} for an example that uses this file.
\item \code{rubisco.fasta} Sequences of Rubisco obtained from UniProt (see Dick, 2014). See the vignette \viglink{anintro} for an example that uses this file.
\item \code{POLG.csv}
Amino acid compositions of a few proteins used for some tests and examples.
@@ -81,8 +80,6 @@
Gattiker, A., Michoud, K., Rivoire, C., Auchincloss, A. H., Coudert, E., Lima, T., Kersey, P., Pagni, M., Sigrist, C. J. A., Lachaize, C., Veuthey, A.-L., Gasteiger, E. and Bairoch, A. (2003) Automatic annotation of microbial proteomes in Swiss-Prot. \emph{Comput. Biol. Chem.} \bold{27}, 49--58. \doi{10.1016/S1476-9271(02)00094-4}
-Gaucher, E. A., Thomson, J. M., Burgan, M. F. and Benner, S. A (2003) Inferring the palaeoenvironment of ancient bacteria on the basis of resurrected proteins. \emph{Nature} \bold{425}(6955), 285--288. \doi{10.1038/nature01977}
-
Helgeson, H. C., Delany, J. M., Nesbitt, H. W. and Bird, D. K. (1978) Summary and critique of the thermodynamic properties of rock-forming minerals. \emph{Am. J. Sci.} \bold{278-A}, 1--229. \url{https://www.worldcat.org/oclc/13594862}
Hnědkovský, L., Wood, R. H. and Majer, V. (1996) Volumes of aqueous solutions of \CH4, \CO2, \H2S, and \NH3 at temperatures from 298.15 K to 705 K and pressures to 35 MPa. \emph{J. Chem. Thermodyn.} \bold{28}, 125--142. \doi{10.1006/jcht.1996.0011}
Modified: pkg/CHNOSZ/man/palply.Rd
===================================================================
--- pkg/CHNOSZ/man/palply.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/palply.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -25,7 +25,7 @@
}
\seealso{
-\code{\link{read.fasta}}, \code{\link{count.aa}}, \code{\link{affinity}}, \code{\link{equil.boltzmann}} and \code{\link{equil.reaction}} for functions that use \code{palply}.
+\code{\link{affinity}}, \code{\link{equil.boltzmann}} and \code{\link{equil.reaction}} for functions that use \code{palply}.
Tests are in \file{tests/test-util.program.R}, and a \dQuote{real world} example is in \file{demo/density.R}.
}
Modified: pkg/CHNOSZ/man/rank.affinity.Rd
===================================================================
--- pkg/CHNOSZ/man/rank.affinity.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/rank.affinity.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -41,7 +41,7 @@
datfile <- system.file("extdata/cpetc/rubisco.csv", package = "CHNOSZ")
fastafile <- system.file("extdata/protein/rubisco.fasta", package = "CHNOSZ")
dat <- read.csv(datfile)
-aa <- read.fasta(fastafile)
+aa <- canprot::read.fasta(fastafile)
groups <- sapply(c("A", "B", "E"), "==", dat$domain, simplify = FALSE)
names(groups) <- c("Archaea", "Bacteria", "Eukaryota")
ip <- add.protein(aa, as.residue = TRUE)
Deleted: pkg/CHNOSZ/man/util.fasta.Rd
===================================================================
--- pkg/CHNOSZ/man/util.fasta.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/util.fasta.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -1,78 +0,0 @@
-\encoding{UTF-8}
-\name{util.fasta}
-\alias{util.fasta}
-\alias{read.fasta}
-\alias{count.aa}
-\title{Functions for Reading FASTA Files and Downloading from UniProt}
-
-\description{
- Search the header lines of a FASTA file, read protein sequences from a file, count numbers of amino acids in each sequence, and download sequences from UniProt.
-}
-
-\usage{
- read.fasta(file, iseq = NULL, ret = "count", lines = NULL,
- ihead = NULL, start=NULL, stop=NULL, type="protein", id = NULL)
- count.aa(seq, start=NULL, stop=NULL, type="protein")
-}
-
-\arguments{
- \item{file}{character, path to FASTA file}
- \item{iseq}{numeric, which sequences to read from the file}
- \item{ret}{character, specification for type of return (count, sequence, or FASTA format)}
- \item{lines}{list of character, supply the lines here instead of reading them from file}
- \item{ihead}{numeric, which lines are headers}
- \item{start}{numeric, position in sequence to start counting}
- \item{stop}{numeric, position in sequence to stop counting}
- \item{type}{character, sequence type (protein or DNA)}
- \item{id}{character, value to be used for \code{protein} in output table}
- \item{seq}{character, amino acid sequence of a protein}
-}
-
-\details{
-\code{read.fasta} is used to retrieve entries from a FASTA file.
-Use \code{iseq} to select the sequences to read (the default is all sequences).
-The function returns various formats depending on the value of \code{ret}.
-The default \samp{count} returns a data frame of amino acid counts (the data frame can be given to \code{\link{add.protein}} in order to add the proteins to \code{\link{thermo}$protein}), \samp{seq} returns a list of sequences, and \samp{fas} returns a list of lines extracted from the FASTA file, including the headers (this can be used e.g. to generate a new FASTA file with only the selected sequences).
-If the line numbers of the header lines were previously determined, they can be supplied in \code{ihead}.
-Optionally, the lines of a previously read file may be supplied in \code{lines} (in this case no file is needed so \code{file} should be set to "").
-When \code{ret} is \samp{count}, the names of the proteins in the resulting data frame are parsed from the header lines of the file, unless \code{id} is provided.
-If \code{id} is not given, and a UniProt FASTA header is detected (regular expression \code{"\\|......\\|.*_"}), information there (accession, name, organism) is split into the \code{protein}, \code{abbrv}, and \code{organism} columns of the resulting data frame.
-
-\code{count.aa} counts the occurrences of each amino acid or nucleic-acid base in a sequence (\code{seq}).
-For amino acids, the columns in the returned data frame are in the same order as \code{thermo()$protein}.
-The matching of letters is case-insensitive.
-A warning is generated if any character in \code{seq}, excluding spaces, is not one of the single-letter amino acid or nucleobase abbreviations.
-\code{start} and/or \code{stop} can be provided to count a fragment of the sequence (extracted using \code{\link{substr}}).
-If only one of \code{start} or \code{stop} is present, the other defaults to 1 (\code{start}) or the length of the sequence (\code{stop}).
-}
-
-\value{
-\code{read.fasta} returns a list of sequences or lines (for \code{ret} equal to \samp{seq} or \samp{fas}, respectively), or a data frame with amino acid compositions of proteins (for \code{ret} equal to \samp{count}) with columns corresponding to those in \code{\link{thermo}$protein}.
-}
-
-\seealso{
-\code{\link{seq2aa}}, like \code{count.aa}, counts amino acids in a user-input sequence, but returns a data frame in the format of \code{thermo()$protein}.
-}
-
-\examples{\dontshow{reset()}
-## Reading a protein FASTA file
-# The path to the file
-file <- system.file("extdata/protein/EF-Tu.aln", package = "CHNOSZ")
-# Read the sequences, and print the first one
-read.fasta(file, ret = "seq")[[1]]
-# Count the amino acids in the sequences
-aa <- read.fasta(file)
-# Compute lengths (number of amino acids)
-protein.length(aa)
-
-\dontrun{
-## Count amino acids in a sequence
-count.aa("GGSGG")
-# Warnings are issued for unrecognized characters
-atest <- count.aa("WhatAmIMadeOf?")
-# There are 3 "A" (alanine)
-atest[, "A"]
-}
-}
-
-\concept{Protein properties}
Modified: pkg/CHNOSZ/man/util.seq.Rd
===================================================================
--- pkg/CHNOSZ/man/util.seq.Rd 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/man/util.seq.Rd 2024-02-28 01:24:53 UTC (rev 828)
@@ -25,11 +25,13 @@
}
-\seealso{\code{\link{count.aa}} for counting amino acids or nucleic-acid bases in a sequence; \code{\link{protein.formula}} for calculating the chemical formulas of proteins.}
+\seealso{
+ \code{\link{protein.formula}} for calculating the chemical formulas of proteins.
+}
-\examples{\dontshow{reset()}
-## Count nucleobases in a sequence
-bases <- count.aa("ACCGGGTTT", type = "DNA")
+\examples{
+# Three-letter abbreviations of amino acids
+aminoacids(3)
}
\concept{Protein properties}
Modified: pkg/CHNOSZ/src/init.c
===================================================================
--- pkg/CHNOSZ/src/init.c 2024-02-11 14:48:58 UTC (rev 827)
+++ pkg/CHNOSZ/src/init.c 2024-02-28 01:24:53 UTC (rev 828)
@@ -13,19 +13,11 @@
{NULL, NULL, 0}
};
-/* .C calls */
-extern void count_letters(void *, void *);
-
-static const R_CMethodDef CEntries[] = {
- {"count_letters", (DL_FUNC) &count_letters, 2},
- {NULL, NULL, 0}
-};
-
void R_init_CHNOSZ(DllInfo *dll)
{
+ R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
+/*
R_registerRoutines(dll, CEntries, NULL, FortranEntries, NULL);
-/*
- R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL);
R_registerRoutines(dll, CEntries, NULL, NULL, NULL);
*/
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/chnosz -r 828
More information about the CHNOSZ-commits
mailing list