[Seqinr-commits] r1761 - in pkg: . R inst/sequences man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Sep 20 14:59:55 CEST 2011
Author: simonpenel
Date: 2011-09-20 14:59:55 +0200 (Tue, 20 Sep 2011)
New Revision: 1761
Added:
pkg/R/draw.recstat.R
pkg/R/recstat.R
pkg/R/test.co.recstat.R
pkg/R/test.li.recstat.R
pkg/inst/sequences/ECOUNC.fsa
pkg/man/draw.recstat.Rd
pkg/man/recstat.Rd
pkg/man/test.co.recstat.Rd
pkg/man/test.li.recstat.Rd
Modified:
pkg/DESCRIPTION
Log:
Addition of the "recstat" function (Prediction of coding DNA sequences through the use of a Correspondence Analysis). Guy Perriere is now part of the seqinr project.
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2011-05-30 14:37:05 UTC (rev 1760)
+++ pkg/DESCRIPTION 2011-09-20 12:59:55 UTC (rev 1761)
@@ -1,9 +1,9 @@
Package: seqinr
-Version: 3.0-3
-Date: 2011-04-05
+Version: 3.0-4
+Date: 2011-09-20
Title: Biological Sequences Retrieval and Analysis
-Author: Delphine Charif and Jean R. Lobry and Anamaria Necsulea and Leonor Palmeira and Simon Penel
-Maintainer: Simon Penel <penel at biomserv.univ-lyon1.fr>
+Author: Delphine Charif and Jean R. Lobry and Anamaria Necsulea and Leonor Palmeira and Simon Penel and Guy Perriere
+Maintainer: Simon Penel <simon.penel at univ-lyon1.fr>
Depends: R (>= 2.10.0)
Suggests: ade4, segmented
Description: Exploratory data analysis and data visualization
Added: pkg/R/draw.recstat.R
===================================================================
--- pkg/R/draw.recstat.R (rev 0)
+++ pkg/R/draw.recstat.R 2011-09-20 12:59:55 UTC (rev 1761)
@@ -0,0 +1,185 @@
+##
+# This function draws two graphics, one of the CA of a DNA sequence, and one of
+# start/stop codons positions in the three reading frames. This for the direct or
+# the reverse strand.
+##
+#v.18.08.2011
+draw.recstat <- function(rec, fac = 1, direct = TRUE, xlim = c(1, seqsize), col = c("red", "blue", "purple"))
+{
+ if (fac < 0 | 4 < fac)
+ { # test if factor is between 1 and 4
+ print("Factor number is not in 1:4.")
+ return()
+ }
+ seq <- rec[[1]] # recovery of elements of list rec
+ sizewin <- rec[[2]]
+ shift <- rec[[3]]
+ seqsize <- rec[[4]]
+ seqname <- rec[[5]]
+ vstopd <- rec[[8]]
+ vstopr <- rec[[9]]
+ vinitd <- rec[[10]]
+ vinitr <- rec[[11]]
+ recd <- rec[[14]]
+ recr <- rec[[15]]
+ if (xlim[1] < 1 | xlim[1] > seqsize)
+ {
+ xlim <- c(1, xlim[2])
+ }
+ if (seqsize < xlim[2] | 1 > xlim[2])
+ {
+ xlim <- c(xlim[1], seqsize)
+ }
+ par(mfrow = c(2, 1), mar = c(0, 4, 4, 2) + 0.1) # division of the window for a closer between plots
+ par(xaxs = "i")
+ seqisize <- floor((dim(recd$li)[1])/3) # number of window by reading frame, we take the integer part
+ if ((dim(recd$li)[1])%%3 == 1) # adaptation of number of window between each reading frame
+ {
+ seqisize1 <- seqisize + 1 # for fr1
+ seqisize2 <- seqisize # for fr2
+ }
+ if ((dim(recd$li)[1])%%3 == 2)
+ {
+ seqisize1 <- seqisize + 1
+ seqisize2 <- seqisize + 1
+ }
+ if ((dim(recd$li)[1])%%3 == 0)
+ {
+ seqisize1 <- seqisize
+ seqisize2 <- seqisize
+ }
+ ##
+ ##direct strand##
+ ##
+ if (direct)
+ {
+ plot((sizewin/2) + (0:(seqisize1 - 1))*shift, recd$li[1:seqisize1, fac], type = "l", lty = 1,
+ col = col[1], xlim = xlim, ylim = c(min(recd$li[, fac]), max(recd$li[, fac])),
+ main = "Direct strand", xlab = "", ylab = "Factor scores", bty = 'l') # reading frame 1
+ lines((sizewin/2) + (0:(seqisize2 - 1))*shift + 1, recd$li[(seqisize1 + 1):(seqisize1 + seqisize2), fac],
+ lty = 2, col = col[2], ylab = "2") # reading frame 2
+ lines((sizewin/2) + (0:(seqisize - 1))*shift + 2, recd$li[(seqisize1 + seqisize2 + 1):(dim(recd$li)[1]), fac],
+ lty = 3, col = col[3], ylab = "3") # reading frame 3
+ legend("topleft", legend = c(paste("Sequence name:", seqname), paste("Sequence length:", seqsize, "bp")),
+ inset = c(-0.15, -0.2), bty = "n", xpd = TRUE)
+ vstopdindphase <- numeric()
+ if (length(vstopd) > 0)
+ { # test if vector is not empty because problem with modulo
+ vstopdindphase <- sapply(1:length(vstopd), function(x)
+ { # index vector of reading frame of vector vstopd
+ if (vstopd[x]%%3 == 1)
+ {
+ vstopdindphase <- c(vstopdindphase, 2.5)
+ }
+ else
+ {
+ if (vstopd[x]%%3 == 2)
+ {
+ vstopdindphase <- c(vstopdindphase, 1.5)
+ }
+ else
+ {
+ vstopdindphase <- c(vstopdindphase, 0.5)
+ }
+ }
+ })
+ }
+ vinitdindphase <- numeric()
+ if (length(vinitd) > 0)
+ { # test if vector is not empty because problem with modulo
+ vinitdindphase <- sapply(1:length(vinitd), function(x)
+ { # index vector of reading frame of vector vinitd
+ if (vinitd[x]%%3 == 1)
+ {
+ vinitdindphase <- c(vinitdindphase, 3)
+ }
+ else
+ {
+ if (vinitd[x]%%3 == 2)
+ {
+ vinitdindphase <- c(vinitdindphase, 2)
+ }
+ else
+ {
+ vinitdindphase <- c(vinitdindphase, 1)
+ }
+ }
+ })
+ }
+ par(mar = c(5, 4, 3, 2) + 0.1)
+ plot(vstopd, vstopdindphase, pch = 25, cex = 0.7, xlim = xlim, ylim = c(0.25, 3), axes = TRUE,
+ ann = TRUE, tcl = -0.5, bty = 'l', yaxt = 'n', xlab = "Start/Stop positions (bp)",
+ ylab = '', xpd = FALSE) # stop codons positions
+ points(vinitd, vinitdindphase, pch = 24, bg = "slategray", cex = 0.7, col = 'slategray') # start codons positions
+ abline(h = c(3.1, 2.4, 2.1, 1.4, 1.1, 0.4), col = c(col[1], col[1], col[2], col[2], col[3], col[3]),
+ lty = c(1, 1, 2, 2, 3, 3))
+ text(x = (xlim[1]-(xlim[2]-xlim[1])*0.75/6), pos = 4, y = c(2.75, 1.75, 0.75), labels = paste("Ph. ", c(0, 1, 2)), xpd = TRUE)
+ }
+ ##
+ ##reverse strand##
+ ##
+ if (!direct)
+ {
+ plot((sizewin/2) + (0:(seqisize1 - 1))*shift, recr$li[1:seqisize1, fac], type = "l", lty = 1,
+ col = col[1], xlim = xlim, ylim = c(min(recr$li[, fac]), max(recr$li[, fac])),
+ main = "Reverse strand", xlab = "", ylab = "Factor scores", bty = 'l') # reading frame 1
+ lines((sizewin/2) + (0:(seqisize2-1))*shift + 1, recr$li[(seqisize1 + 1):(seqisize1 + seqisize2), fac],
+ lty = 2, col = col[2], ylab="2") # reading frame 2
+ lines((sizewin/2) + (0:(seqisize - 1))*shift + 2, recr$li[(seqisize1 + seqisize2 + 1):(dim(recr$li)[1]), fac],
+ lty = 3, col = col[3], ylab = "3") # reading frame 3
+ legend("topleft", legend = c(paste("Sequence name:", seqname), paste("Sequence length:", seqsize, "bp")),
+ inset = c(-0.15, -0.2), bty = "n", xpd = TRUE)
+ vstoprindphase <- numeric()
+ if (length(vstopr) > 0)
+ { # test if vector is not empty because problem with modulo
+ vstoprindphase <- sapply(1:length(vstopr), function(x)
+ { # index vector of reading frame of vector vstopr
+ if (vstopr[x]%%3 == 1)
+ {
+ vstoprindphase <- c(vstoprindphase, 2.5)
+ }
+ else
+ {
+ if (vstopr[x]%%3 == 2)
+ {
+ vstoprindphase <- c(vstoprindphase, 1.5)
+ }
+ else
+ {
+ vstoprindphase <- c(vstoprindphase, 0.5)
+ }
+ }
+ })
+ }
+ vinitrindphase <- numeric()
+ if (length(vinitr) > 0)
+ { # test if vector is not empty because problem with modulo
+ vinitrindphase <- sapply(1:length(vinitr), function(x)
+ { # index vector of reading frame of vector vinitr
+ if (vinitr[x]%%3 == 1)
+ {
+ vinitrindphase <- c(vinitrindphase, 3)
+ }
+ else
+ {
+ if (vinitr[x]%%3 == 2)
+ {
+ vinitrindphase <- c(vinitrindphase, 2)
+ }
+ else
+ {
+ vinitrindphase <- c(vinitrindphase, 1)
+ }
+ }
+ })
+ }
+ par(mar = c(5, 4, 3, 2) + 0.1)
+ plot(vstopr, vstoprindphase, pch = 25, cex = 0.7, xlim = xlim, ylim = c(0.25, 3), axes = TRUE,
+ ann = TRUE, tcl = -0.5, bty = 'l', yaxt = 'n', xlab = "Start/Stop positions (bp)",
+ ylab = '', xpd = FALSE) # stop codons positions
+ points(vinitr, vinitrindphase, pch = 24, bg = "slategray", cex = 0.7, col = 'slategray') # start codons positions
+ abline(h = c(3.1, 2.4, 2.1, 1.4, 1.1, 0.4), col = c(col[1], col[1], col[2], col[2], col[3], col[3]),
+ lty = c(1, 1, 2, 2, 3, 3))
+ text(x = (xlim[1]-(xlim[2]-xlim[1])*0.75/6), pos = 4, y = c(2.75, 1.75, 0.75), labels = paste("Ph. ", c(0, 1, 2)), xpd = TRUE)
+ }
+}
\ No newline at end of file
Property changes on: pkg/R/draw.recstat.R
___________________________________________________________________
Added: svn:executable
+ *
Added: pkg/R/recstat.R
===================================================================
--- pkg/R/recstat.R (rev 0)
+++ pkg/R/recstat.R 2011-09-20 12:59:55 UTC (rev 1761)
@@ -0,0 +1,65 @@
+##
+# This function counts the number of each triplet in a sliding window,
+# create a contingency table with triplet composition and then computes
+# a correspondence analysis on this table
+##
+#v.18.08.2011
+recstat <- function(seq, sizewin = 90, shift = 30, seqname = "no name")
+{
+ if (is.character(seq) == FALSE)
+ { # class character if only one sequence, class list if more
+ print("This file has more than one sequence or class is not character.")
+ return()
+ }
+ if ((shift%%3) != 0)
+ { # test if shift give the same reading frame
+ print("The windows are not in the same reading frame, please change shift value to a multiple of 3.")
+ return()
+ }
+ if (sizewin%%3 != 0)
+ {
+ print("The length of the window is not a multiple of 3, please change sizewin value.")
+ return()
+ }
+ seqsize <- length(seq) # give the number of 1-mer in the sequence
+ v1 <- seq(from = 1, to = seqsize, by = shift) # start vector of window in reading frame 1
+ v1 <- v1[1:(which((v1 + sizewin) > seqsize)[1])] # suppression if more than one incomplete window
+ v2 <- seq(from = 2, to = seqsize, by = shift)
+ v2 <- v2[1:(which((v2 + sizewin)>seqsize)[1])]
+ v3 <- seq(from = 3, to = seqsize, by = shift)
+ v3 <- v3[1:(which((v3 + sizewin) > seqsize)[1])]
+ vdep <- c(v1, v2, v3) # start vector in the 3 reading frames of direct/reverse strand
+ vind <- c(rep(1, length(v1)), rep(2, length(v2)), rep(3, length(v3))) # index vector of reading frame for each window
+ ##
+ ##direct strand##
+ ##
+ cseq <- c2s(seq)
+ vstopd <- c(words.pos("taa", cseq), words.pos("tag", cseq), words.pos("tga", cseq)) # vector of stop codons positions in direct strand
+ vinitd <- c(words.pos("atg", cseq)) # vector of start codons positions in direct strand
+ resd <- lapply(1:length(vdep), function(x)
+ { # calculation on 3 reading frames of direct strand
+ seq_tmp <- seq[(vdep[x]):(vdep[x] + sizewin - 1)] # temporary window
+ count(seq_tmp, wordsize = 3, start = 0, by = 3) # counting of triplets
+ })
+ ##
+ ##reverse strand##
+ ##
+ seq_reverse <- rev(comp(seq, ambiguous = TRUE)) # creation of reverse strand
+ cseq_reverse <- c2s(seq_reverse)
+ vstopr <- c(words.pos("taa", cseq_reverse), words.pos("tag", cseq_reverse), words.pos("tga", cseq_reverse)) # vector of stop codons positions in reverse strand
+ vinitr <- c(words.pos("atg", cseq_reverse)) # vector of init codons positions in reverse strand
+ resr <- lapply(1:length(vdep), function(x)
+ { # calculation on 3 reading frames of reverse strand
+ seq_tmp <- seq_reverse[(vdep[x]):(vdep[x] + sizewin - 1)] # temporary window
+ count(seq_tmp, wordsize = 3, start = 0, by = 3) # counting of triplets
+ })
+ resd <- matrix(unlist(resd), byrow = TRUE, ncol = 64) # conversion vector to contingency table
+ resr <- matrix(unlist(resr), byrow = TRUE, ncol = 64)
+ ##
+ ##CA##
+ ##
+ resd.coa <- dudi.coa(resd, scannf = FALSE, nf = 4) # CA on direct strand
+ resr.coa <- dudi.coa(resr, scannf = FALSE, nf = 4) # CA on reverse strand
+ rec <- list(seq, sizewin, shift, seqsize, seqname, vdep, vind, vstopd, vstopr, vinitd, vinitr, resd, resr, resd.coa, resr.coa)
+ return(rec)
+}
Property changes on: pkg/R/recstat.R
___________________________________________________________________
Added: svn:executable
+ *
Added: pkg/R/test.co.recstat.R
===================================================================
--- pkg/R/test.co.recstat.R (rev 0)
+++ pkg/R/test.co.recstat.R 2011-09-20 12:59:55 UTC (rev 1761)
@@ -0,0 +1,205 @@
+##
+# This function tests if a region located between two stop codons could be a putative CDS
+#
+# Data used are the factor scores of the CA computed on the windows by recstat function
+##
+#v.18.08.2011
+test.co.recstat <- function(rec, fac = 1, length.min = 150, stop.max = 0.2, win.lim = 0.8, direct = TRUE, level = 0.01)
+{
+ if (fac < 0 | 4 < fac)
+ { # test if factor is between 1 and 4
+ print("Factor number is not in 1:4.")
+ return()
+ }
+ seq <- rec[[1]] # recovery of elements of list n
+ sizewin <- rec[[2]]
+ shift <- rec[[3]]
+ seqsize <- rec[[4]]
+ vstopd <- rec[[8]]
+ vstopr <- rec[[9]]
+ resd <- rec[[12]]
+ resr <- rec[[13]]
+ recd <- rec[[14]]
+ recr <- rec[[15]]
+ if (seqsize < length.min)
+ {
+ print("Seqence length is shorter than minimum distance between two Stop codons.")
+ return()
+ }
+ table.recstat <- function(vstop)
+ {
+ tabCDS <- numeric() # initialization
+ j <- 0
+ for (i in 2:length(vstop))
+ { # for each stop codons positions vector
+ if ((vstop[i] - vstop[i - 1]) > length.min)
+ { # test if space between codons is above the threshold
+ # in each case gets the p-values between each stop codon and range it in vector seg
+ seg <- pvalvec[which((vstop[i - 1] - pvalvec[, 2])/(sizewin + 2) <= stop.max &
+ (vstop[i] - pvalvec[, 2])/(sizewin + 2) >= (1 - stop.max)), 1]
+ # create a table with calculation on those vectors seg then go to next space inter-codon, each row correspond to a space inter-stop codon
+ k <- 0
+ for (l in 1:length(seg))
+ {
+ if (seg[l] < level)
+ {
+ k <- k + 1
+ }
+ if (k/length(seg) > win.lim)
+ {
+ result <- 1
+ }
+ else
+ {
+ result <- 0
+ }
+ }
+ tabCDS <- c(tabCDS, vstop[i - 1]+3, vstop[i]+2, result)
+ j <- j + 1
+ }
+ }
+ tabCDS <- matrix(tabCDS, nrow = j, ncol = 3, byrow = TRUE) # conversion list to table
+ return(tabCDS)
+ }
+ ##
+ ##direct strand##
+ ##
+ if (direct)
+ {
+ vstopdindphase <- numeric()
+ if (length(vstopd) > 0)
+ { # test if vector is not empty because problem with modulo
+ vstopdindphase <- sapply(1:length(vstopd), function(x)
+ { # index vector of reading frame of vector vstopd
+ if (vstopd[x]%%3 == 1)
+ {
+ vstopdindphase <- c(vstopdindphase, 1)
+ }
+ else
+ {
+ if (vstopd[x]%%3 == 2)
+ {
+ vstopdindphase <- c(vstopdindphase, 2)
+ }
+ else
+ {
+ vstopdindphase <- c(vstopdindphase, 3)
+ }
+ }
+ })
+ }
+ vstop1 <- vstopd[vstopdindphase == 1] # vector with only stop codons in reading frame 1
+ vstop2 <- vstopd[vstopdindphase == 2] # vector with only stop codons in reading frame 2
+ vstop3 <- vstopd[vstopdindphase == 3] # vector with only stop codons in reading frame 3
+ vstop1 <- c(vstop1, 1-3, seqsize-(seqsize%%3)-2) # add start and end positions, "-3" and "-2" because of table.recstat()
+ vstop2 <- c(vstop2, 2-3, seqsize-((seqsize-1)%%3)-2)
+ vstop3 <- c(vstop3, 3-3, seqsize-((seqsize-2)%%3)-2)
+ vstop1 <- sort(unique(vstop1)) # sort of the vector
+ vstop2 <- sort(unique(vstop2))
+ vstop3 <- sort(unique(vstop3))
+ tab <- numeric()
+ for (i in 1:dim(resd)[1])
+ {
+ if (sum(resd[i, ]) == sizewin/3)
+ {
+ for (j in 1:64)
+ {
+ tab <- c(tab, rep(recd$co[j, 1], resd[i, j]))
+ }
+ }
+ }
+ tab <- matrix(unlist(tab), byrow = TRUE, ncol = sizewin/3)
+ seqisize <- floor((dim(tab)[1])/3)
+ phase <- c(rep(1, sizewin/3), rep(2, sizewin/3), rep(3, sizewin/3))
+ phase <- as.factor(phase)
+ pvalvec <- numeric()
+ for (i in 1:seqisize)
+ {
+ v1 <- tab[i,]
+ v2 <- tab[seqisize + i,]
+ v3 <- tab[2*seqisize + i,]
+ v <- c(v1, v2, v3)
+ x <- kruskal.test(v ~ phase)$p.value
+ pvalvec <- c(pvalvec, x, (i - 1)*shift + 1)
+ }
+ pvalvec <- matrix(unlist(pvalvec), byrow = TRUE, ncol = 2)
+# plot((sizewin/2) + (0:(seqisize - 1))*shift + 1, pvalvec[,1], type = "l")
+ tab1 <- table.recstat(vstop1)
+ colnames(tab1) <- c("Start", "End", "CDS") # definition of table headings
+ tab2 <- table.recstat(vstop2)
+ colnames(tab2) <- c("Start", "End", "CDS")
+ tab3 <- table.recstat(vstop3)
+ colnames(tab3) <- c("Start", "End", "CDS")
+ return(list(tab1, tab2, tab3))
+ }
+ ##
+ ##reverse strand##
+ ##
+ if (!direct)
+ {
+ vstoprindphase <- numeric()
+ if (length(vstopr) > 0)
+ {
+ vstoprindphase <- sapply(1:length(vstopr), function(x)
+ {
+ if (vstopr[x]%%3 == 1)
+ {
+ vstoprindphase <- c(vstoprindphase, 1)
+ }
+ else
+ {
+ if (vstopr[x]%%3 == 2)
+ {
+ vstoprindphase <- c(vstoprindphase, 2)
+ }
+ else
+ {
+ vstoprindphase <- c(vstoprindphase, 3)
+ }
+ }
+ })
+ }
+ vstop1 <- vstopr[vstoprindphase == 1]
+ vstop2 <- vstopr[vstoprindphase == 2]
+ vstop3 <- vstopr[vstoprindphase == 3]
+ vstop1 <- c(vstop1, 1-3, seqsize-(seqsize%%3)-2) # add start and end positions
+ vstop2 <- c(vstop2, 2-3, seqsize-((seqsize-1)%%3)-2)
+ vstop3 <- c(vstop3, 3-3, seqsize-((seqsize-2)%%3)-2)
+ vstop1 <- sort(unique(vstop1))
+ vstop2 <- sort(unique(vstop2))
+ vstop3 <- sort(unique(vstop3))
+ tab <- numeric()
+ for (i in 1:dim(resr)[1])
+ {
+ if (sum(resr[i, ]) == sizewin/3)
+ {
+ for (j in 1:64)
+ {
+ tab <- c(tab, rep(recr$co[j, 1], resr[i, j]))
+ }
+ }
+ }
+ tab <- matrix(unlist(tab), byrow = TRUE, ncol = sizewin/3)
+ seqisize <- floor((dim(tab)[1])/3)
+ phase <- c(rep(1, sizewin/3), rep(2, sizewin/3), rep(3, sizewin/3))
+ phase <- as.factor(phase)
+ pvalvec <- numeric()
+ for (i in 1:seqisize)
+ {
+ v1 <- tab[i,]
+ v2 <- tab[seqisize + i,]
+ v3 <- tab[2*seqisize + i,]
+ v <- c(v1, v2, v3)
+ x <- kruskal.test(v ~ phase)$p.value
+ pvalvec <- c(pvalvec, x, (i - 1)*shift + 1)
+ }
+ pvalvec <- matrix(unlist(pvalvec), byrow = TRUE, ncol = 2)
+ tab1 <- table.recstat(vstop1)
+ colnames(tab1) <- c("Start", "End", "CDS") # definition of table headings
+ tab2 <- table.recstat(vstop2)
+ colnames(tab2) <- c("Start", "End", "CDS")
+ tab3 <- table.recstat(vstop3)
+ colnames(tab3) <- c("Start", "End", "CDS")
+ return(list(tab1, tab2, tab3))
+ }
+}
\ No newline at end of file
Property changes on: pkg/R/test.co.recstat.R
___________________________________________________________________
Added: svn:executable
+ *
Added: pkg/R/test.li.recstat.R
===================================================================
--- pkg/R/test.li.recstat.R (rev 0)
+++ pkg/R/test.li.recstat.R 2011-09-20 12:59:55 UTC (rev 1761)
@@ -0,0 +1,180 @@
+##
+# This function tests if a region located between two stop codons could be a putative CDS
+#
+# Data used are the factor scores of the CA computed on the windows by recstat function
+##
+#v.18.08.2011
+test.li.recstat <- function(rec, fac = 1, length.min = 150, stop.max = 0.2, direct = TRUE, level = 0.05)
+{
+ if (fac < 0 | 4 < fac)
+ { # test if factor is between 1 and 4
+ print("Factor number is not in 1:4.")
+ return()
+ }
+ seq <- rec[[1]] # recovery of elements of list n
+ sizewin <- rec[[2]]
+ shift <- rec[[3]]
+ seqsize <- rec[[4]]
+ vdep <- rec[[6]]
+ vind <- rec[[7]]
+ vstopd <- rec[[8]]
+ vstopr <- rec[[9]]
+ recd <- rec[[14]]
+ recr <- rec[[15]]
+ if (seqsize < length.min)
+ {
+ print("Seqence length is shorter than minimum distance between two Stop codons.")
+ return()
+ }
+ table.recstat <- function(vstop, rec, frame)
+ {
+ tabCDS <- numeric() # initialization
+ j <- 0
+ for (i in 2:length(vstop))
+ { # for each stop codons positions vector
+ if ((vstop[i] - vstop[i - 1]) > length.min)
+ { # test if space between codons is above the threshold
+ # in each case gets the values between each stop codon for the 3 reading frames and range it in 3 vector seg
+ seg1 <- rec$li[which((vstop[i - 1] - vdep[1:seqisize1])/sizewin <= stop.max &
+ (vstop[i] - vdep[1:seqisize1])/sizewin >= (1 - stop.max)), fac]
+ seg2 <- rec$li[(which((vstop[i - 1] - vdep[(seqisize1 + 1):(seqisize1 + seqisize2)])/sizewin <= stop.max
+ & (vstop[i] - vdep[(seqisize1 + 1):(seqisize1 + seqisize2)])/sizewin >= (1 - stop.max)) + seqisize1), fac]
+ seg3 <- rec$li[(which((vstop[i - 1] - vdep[(seqisize1 + seqisize2 + 1):(length(vdep))])/sizewin <= stop.max
+ & (vstop[i] - vdep[(seqisize1 + seqisize2 + 1):(length(vdep))])/sizewin >= (1 - stop.max)) + seqisize1 + seqisize2), fac]
+ # create a table with calculation on those vectors seg then go to next space
+ # inter-codon, each row correspond to a space inter-stop codon
+ if (frame == 1)
+ {
+ test1 <- t.test(seg1, seg2)$p.value
+ test2 <- t.test(seg1, seg3)$p.value
+ }
+ if (frame == 2)
+ {
+ test1 <- t.test(seg2, seg1)$p.value
+ test2 <- t.test(seg2, seg3)$p.value
+ }
+ if (frame == 3)
+ {
+ test1 <- t.test(seg3, seg1)$p.value
+ test2 <- t.test(seg3, seg2)$p.value
+ }
+ if (test1 < level & test2 < level)
+ {
+ result <- 1
+ }
+ else
+ {
+ result <- 0
+ }
+ tabCDS <- c(tabCDS, vstop[i-1]+3, vstop[i]+2, mean(seg1), mean(seg2), mean(seg3), test1, test2, result)
+ j <- j + 1
+ }
+ }
+ tabCDS <- matrix(tabCDS, nrow = j, ncol = 8, byrow = TRUE) # conversion list to table
+ return(tabCDS)
+ }
+ seqisize <- floor((dim(recd$li)[1])/3) # number of window by reading frame, we take the integer part
+ if ((dim(recd$li)[1])%%3 == 1) # adaptation of number of window between each reading frame
+ {
+ seqisize1 <- seqisize + 1 # for fr1
+ seqisize2 <- seqisize # for fr2
+ }
+ if ((dim(recd$li)[1])%%3 == 2)
+ {
+ seqisize1 <- seqisize + 1
+ seqisize2 <- seqisize + 1
+ }
+ if ((dim(recd$li)[1])%%3 == 0)
+ {
+ seqisize1 <- seqisize
+ seqisize2 <- seqisize
+ }
+ ##
+ ##direct strand##
+ ##
+ if (direct)
+ {
+ vstopdindphase <- numeric()
+ if (length(vstopd) > 0)
+ { # test if vector is not empty because problem with modulo
+ vstopdindphase <- sapply(1:length(vstopd), function(x) { # index vector of reading frame of vector vstopd
+ if (vstopd[x]%%3 == 1)
+ {
+ vstopdindphase <- c(vstopdindphase, 1)
+ }
+ else
+ {
+ if (vstopd[x]%%3 == 2)
+ {
+ vstopdindphase <- c(vstopdindphase, 2)
+ }
+ else
+ {
+ vstopdindphase <- c(vstopdindphase, 3)
+ }
+ }
+ })
+ }
+ vstop1 <- vstopd[vstopdindphase == 1] # vector with only stop codons in reading frame 1
+ vstop2 <- vstopd[vstopdindphase == 2] # vector with only stop codons in reading frame 2
+ vstop3 <- vstopd[vstopdindphase == 3] # vector with only stop codons in reading frame 3
+ vstop1 <- c(vstop1, 1-3, seqsize-(seqsize%%3)-2) # add start and end positions, "-3" and "-2" because of table.recstat()
+ vstop2 <- c(vstop2, 2-3, seqsize-((seqsize-1)%%3)-2)
+ vstop3 <- c(vstop3, 3-3, seqsize-((seqsize-2)%%3)-2)
+ vstop1 <- sort(unique(vstop1)) # sort of the vector
+ vstop2 <- sort(unique(vstop2))
+ vstop3 <- sort(unique(vstop3))
+ tab1 <- table.recstat(vstop1, recd, 1)
+ colnames(tab1) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(1,2)", "t(1,3)", "CDS")
+ tab2 <- table.recstat(vstop2, recd, 2)
+ colnames(tab2) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(2,1)", "t(2,3)", "CDS")
+ tab3 <- table.recstat(vstop3, recd, 3)
+ colnames(tab3) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(3,1)", "t(3,2)", "CDS")
+ return(list(tab1, tab2, tab3))
+ }
+ ##
+ ##reverse strand##
+ ##
+ if (!direct)
+ {
+ vstoprindphase <- numeric()
+ if (length(vstopr) > 0)
+ {
+ vstoprindphase <- sapply(1:length(vstopr), function(x)
+ {
+ if (vstopr[x]%%3 == 1)
+ {
+ vstoprindphase <- c(vstoprindphase, 1)
+ }
+ else
+ {
+ if (vstopr[x]%%3 == 2)
+ {
+ vstoprindphase <- c(vstoprindphase, 2)
+ }
+ else
+ {
+ vstoprindphase <- c(vstoprindphase, 3)
+ }
+ }
+ })
+ }
+ vstop1 <- vstopr[vstoprindphase == 1]
+ vstop2 <- vstopr[vstoprindphase == 2]
+ vstop3 <- vstopr[vstoprindphase == 3]
+ vstop1 <- c(vstop1, 1-3, seqsize-(seqsize%%3)-2) # add start and end positions
+ vstop2 <- c(vstop2, 2-3, seqsize-((seqsize-1)%%3)-2)
+ vstop3 <- c(vstop3, 3-3, seqsize-((seqsize-2)%%3)-2)
+ vstop1 <- sort(unique(vstop1))
+ vstop2 <- sort(unique(vstop2))
+ vstop3 <- sort(unique(vstop3))
+ tab1 <- table.recstat(vstop1, recr, 1)
+ colnames(tab1) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(1,2)", "t(1,3)", "CDS")
+ tab2 <- table.recstat(vstop2, recr, 2)
+ colnames(tab2) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(2,1)", "t(2,3)", "CDS")
+ tab3 <- table.recstat(vstop3, recr, 3)
+ colnames(tab3) <- c("Start", "End", "Mean 1", "Mean 2", "Mean 3", "t(3,1)", "t(3,2)", "CDS")
+ return(list(tab1, tab2, tab3))
+ }
+}
+
Property changes on: pkg/R/test.li.recstat.R
___________________________________________________________________
Added: svn:executable
+ *
Added: pkg/inst/sequences/ECOUNC.fsa
===================================================================
--- pkg/inst/sequences/ECOUNC.fsa (rev 0)
+++ pkg/inst/sequences/ECOUNC.fsa 2011-09-20 12:59:55 UTC (rev 1761)
@@ -0,0 +1,133 @@
+>ECOUNC
+aaagcaaataaaatttaatttttatcaaaaaaatcataaaaaattgaccggttagactgt
+taacaacaaccaggttttctactgatataactggttacatttaacgccacgttcactctt
+ttgcatcaacaagataacgtggctttttttggtaagcagaaaataagtcattagtgaaaa
+tatcagtctgctaaaaatcggcgctaagaaccatcattggctgttaaaacattattaaaa
+atgtcaatgggtggtttttgttgtgtaaatgtcatttattaaaacagtatctgtttttag
+actgaaatatcataaacttgcaaaggcatcatttgccaagtaaataaatatgctgtgcgc
+gaacatgcgcaatatgtgatctgaagcacgctttatcaccagtgtttacgcgttatttac
+agtttttcatgatcgaacagggttagcagaaaagtcgcaattgtatgcactggaaaaata
+tttaaacatttattcaccttttggctacttattgtttgaaatcacgggggcgcaccgtat
+aatttgaccgctttttgatgcttgactctaagccttaaagaaagttttatacgacacgcg
+gcatacctcgaagggagcaggagtgaaaaacgtgatgtctgtgtcgctcgtgagtcgaaa
+cgttgctcggaagcttctgctcgttcagttactggtggtgatagcaagtggattgctgtt
+cagcctcaaagaccccttctggggcgtctctgcaataagcgggggcctggcagtctttct
+gcctaacgttttgtttatgatatttgcctggcgtcaccaggcgcatacaccagcgaaagg
+ccgggtggcctggacattcgcatttggcgaagctttcaaagttctggcgatgttggtgtt
+actggtggtggcgttggcggttttaaaggcggtattcttgccgctgatcgttacgtgggt
+tttggtgctggtggttcagatactggcaccggctgtaattaacaacaaagggtaaaaggc
+atcatggcttcagaaaatatgacgccgcaggattacataggacaccacctgaataacctt
+cagctggacctgcgtacattctcgctggtggatccacaaaaccccccagccaccttctgg
+acaatcaatattgactccatgttcttctcggtggtgctgggtctgttgttcctggtttta
+ttccgtagcgtagccaaaaaggcgaccagcggtgtgccaggtaagtttcagaccgcgatt
+gagctggtgatcggctttgttaatggtagcgtgaaagacatgtaccatggcaaaagcaag
+ctgattgctccgctggccctgacgatcttcgtctgggtattcctgatgaacctgatggat
+ttactgcctatcgacctgctgccgtacattgctgaacatgtactgggtctgcctgcactg
+cgtgtggttccgtctgcggacgtgaacgtaacgctgtctatggcactgggcgtatttatc
+ctgattctgttctacagcatcaaaatgaaaggcatcggcggcttcacgaaagagttgacg
+ctgcagccgttcaatcactgggcgttcattcctgtcaacttaatccttgaaggggtaagc
+ctgctgtccaaaccagtttcactcggtttgcgactgttcggtaacatgtatgccggtgag
+ctgattttcattctgattgctggtctgttgccgtggtggtcacagtggatcctgaatgtg
+ccgtgggccattttccacatcctgatcattacgctgcaagccttcatcttcatggttctg
+acgatcgtctatctgtcgatggcgtctgaagaacattaatttaccaacactactacgttt
+taactgaaacaaactggagactgtcatggaaaacctgaatatggatctgctgtacatggc
+tgccgctgtgatgatgggtctggcggcaatcggtgctgcgatcggtatcggcatcctcgg
+gggtaaattcctggaaggcgcagcgcgtcaacctgatctgattcctctgctgcgtactca
+gttctttatcgttatgggtctggtggatgctatcccgatgatcgctgtaggtctgggtct
+gtacgtgatgttcgctgtcgcgtagtaagcgttgcttttatttaaagagcaatatcagaa
+cgttaactaaatagaggcattgtgctgtgaatcttaacgcaacaatcctcggccaggcca
+tcgcgtttgtcctgttcgttctgttctgcatgaagtacgtatggccgccattaatggcag
+ccatcgaaaaacgtcaaaaagaaattgctgacggccttgcttccgcagaacgagcacata
+aggaccttgaccttgcaaaggccagcgcgaccgaccagctgaaaaaagcgaaagcggaag
+cccaggtaatcatcgagcaggcgaacaaacgccgctcgcagattctggacgaagcgaaag
+ctgaggcagaacaggaacgtactaaaatcgtggcccaggcgcaggcggaaattgaagccg
+agcgtaaacgtgcccgtgaagagctgcgtaagcaagttgctatcctggctgttgctggcg
+ccgagaagatcatcgaacgttccgtggatgaagctgctaacagcgacatcgtggataaac
+ttgtcgctgaactgtaaggagggaggggctgatgtctgaatttattacggtagctcgccc
+ctacgccaaagcagcttttgactttgccgtcgaacaccaaagtgtagaacgctggcagga
+catgctggcgtttgccgccgaggtaaccaaaaacgaacaaatggcagagcttctctctgg
+cgcgcttgcgccagaaacgctcgccgagtcgtttatcgcagtttgtggtgagcaactgga
+cgaaaacggtcagaacctgattcgggttatggctgaaaatggtcgtcttaacgcgctccc
+ggatgttctggagcagtttattcacctgcgtgccgtgagtgaggctaccgctgaggtaga
+cgtcatttccgctgccgcactgagtgaacaacagctcgcgaaaatttctgctgcgatgga
+aaaacgtctgtcacgcaaagttaagctgaattgcaaaatcgataagtctgtaatggcagg
+cgttatcatccgagcgggtgatatggtcattgatggcagcgtacgcggtcgtcttgagcg
+ccttgcagacgtcttgcagtcttaaggggactggagcatgcaactgaattccaccgaaat
+cagcgaactgatcaagcagcgcattgctcagttcaatgttgtgagtgaagctcacaacga
+aggtactattgtttctgtaagtgacggtgttatccgcattcacggcctggccgattgtat
+gcagggtgaaatgatctccctgccgggtaaccgttacgctatcgcactgaacctcgagcg
+cgactctgtaggtgcggttgttatgggtccgtacgctgaccttgccgaaggcatgaaagt
+taagtgtactggccgtatcctggaagttccggttggccgtggcctgctgggccgtgtggt
+taacactctgggtgcaccaatcgacggtaaaggtccgctggatcacgacggcttctctgc
+tgtagaagcaatcgctccgggcgttatcgaacgtcagtccgtagatcagccggtacagac
+cggttataaagccgttgactccatgatcccaatcggtcgtggtcagcgtgaattgatcat
+cggtgaccgtcagacaggtaaaaccgcactggctatcgatgccatcatcaaccagcgcga
+ttccggtatcaaatgtatctatgtcgctatcggccagaaagcgtccaccatttctaacgt
+ggtacgtaaactggaagagcacggcgcactggctaacaccatcgttgtggtagcaaccgc
+gtctgaatccgctgcactgcaatacctggcacgtatgccggttgcgctaatgggcgaata
+cttccgtgaccgcggtgaagatgcgctgatcatttacgatgacctgtctaaacaggctgt
+tgcttaccgtcagatctccctgctgctccgtcgtccgccaggacgtgaagcattcccggg
+cgacgttttctacctccactctcgtctgctggagcgtgctgcacgtgttaacgccgaata
+cgttgaagccttcaccaaaggtgaagtgaaagggaaaaccggttctctgaccgcactgcc
+gattatcgaaactcaggcgggtgacgtttctgcgttcgttccgaccaacgtaatctccat
+taccgatggtcagatcttcctggaaaccaacctgttcaacgccggtattcgtcctgcggt
+taacccgggtatttccgtatcccgtgttggtggtgcagcacagaccaagatcatgaaaaa
+actgtccggtggtatccgtaccgctctggcacagtatcgtgaactggcagcgttctctca
+gtttgcatccgaccttgacgatgcaacacgtaaccagcttgaccacggtcagaaagtgac
+cgaactgctgaaacagaaacagtatgcgccgatgtccgttgcgcagcagtctctggttct
+gttcgcagcagaacgtggttacctggcggatgttgaactgtcgaaaattggcagcttcga
+agccgctctgctggcttacgtcgaccgtgatcacgctccgttgatgcaagagatcaacca
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/seqinr -r 1761
More information about the Seqinr-commits
mailing list