[Seqinr-commits] r1528 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Jan 21 10:22:28 CET 2009
Author: lobry
Date: 2009-01-21 10:22:27 +0100 (Wed, 21 Jan 2009)
New Revision: 1528
Added:
pkg/R/dia.bactgensize.R
Removed:
pkg/R/bactgensize.R
Log:
file name changed to fit fct name
Deleted: pkg/R/bactgensize.R
===================================================================
--- pkg/R/bactgensize.R 2009-01-21 07:12:32 UTC (rev 1527)
+++ pkg/R/bactgensize.R 2009-01-21 09:22:27 UTC (rev 1528)
@@ -1,208 +0,0 @@
-dia.bactgensize <- function(
- fit = 2, p = 0.5, m1 = 2000, sd1 = 600, m2 = 4500, sd2 = 1000, p3 = 0.05,
- m3 = 9000, sd3 = 1000, maxgensize = 20000,
- source = c(system.file("sequences/goldtable15Dec07.txt", package = "seqinr"),
- "http://www.genomesonline.org/DBs/goldtable.txt"))
-{
-#
-# Use local source by default:
-#
- source <- source[1]
-#
-# Build source of data string:
-#
- if(source == system.file("sequences/goldtable15Dec07.txt", package = "seqinr")){
- sodtxt <- "Source of data: GOLD (Genomes OnLine Database) 15 Dec 2007"
- } else {
- sodtxt <- paste("Source of data: GOLD (Genomes OnLine Database)",date())
- }
-#
-# Read data from GOLD:
-#
- alldata <- read.table(source, header = TRUE, sep = "\t",
- comment.char = "", quote = "")
- SUPERKINGDOM <- 1 # col number
- kingdom <- alldata[, SUPERKINGDOM]
- prodata <- alldata[ kingdom == "Archaea" | kingdom == "Bacteria", ]
- data <- prodata[, c("GENUS", "SPECIES", "SIZE.kb.")]
- names(data) <- c("genus", "species", "gs")
- data <- data[complete.cases(data), ]
-#
-# Remove data > maxgensize:
-#
- data <- data[data$gs <= maxgensize, ]
-#
-# Use Kb scale
-#
- sizeKb <- data$gs
- n <- length(sizeKb)
-#
-# Graphics
-#
- x <- seq( min(sizeKb), max(sizeKb), le=200)
- mybreaks <- seq(min(sizeKb),max(sizeKb),length=15)
- vscale <- diff(mybreaks)[1]*n
-
- if(fit == 0)
- {
- hst <- hist(sizeKb, freq = TRUE,
- breaks = mybreaks,
- main=paste("Genome size distribution for", n, "bacterial genomes"),
- xlab="Genome size [Kb]",
- ylab="Genome count",
- col="lightgrey")
-
- dst <- density(sizeKb)
- lines(x=dst$x, y=vscale*dst$y)
- legend(x = max(sizeKb)/2, y = 0.75*max(hst$counts), lty=1,
- "Gaussian kernel density estimation")
-
- mtext(sodtxt)
- }
-##########################################
-#
-# Fitting a mixture of two normal distributions
-#
-###########################################
- if(fit == 2)
- {
- logvraineg <- function(param, obs)
- {
- p <- param[1]
- m1 <- param[2]
- sd1 <- param[3]
- m2 <- param[4]
- sd2 <- param[5]
-
- -sum(log(p*dnorm(obs,m1,sd1)+(1-p)*dnorm(obs,m2,sd2)))
- }
-
- nlmres <- suppressWarnings(nlm(logvraineg, c(p, m1, sd1, m2, sd2), obs=sizeKb))
- estimate <- nlmres$estimate
-
- y1 <- vscale*estimate[1]*dnorm(x, estimate[2], estimate[3])
- y2 <- vscale*(1-estimate[1])*dnorm(x, estimate[4], estimate[5])
- dst <- density(sizeKb)
-
- hst <- hist(sizeKb, plot = FALSE, breaks = mybreaks)
- ymax <- max(y1, y2, hst$counts, vscale*dst$y)
-
-
- hist(sizeKb, freq = TRUE, ylim=c(0,ymax),
- breaks = mybreaks,
- main=paste("Genome size distribution for", n, "bacterial genomes"),
- xlab="Genome size [Kb]",
- ylab="Genome count",
- col="lightgrey")
-
- lines(x, y1, col="red", lwd=2)
- lines(x, y2, col="blue", lwd=2)
-
- text(x = max(sizeKb)/2, y = ymax, pos=4, "Maximum likelihood estimates:")
-
- text(x = max(sizeKb)/2, y = 0.95*ymax, col="red", pos = 4, cex=1.2,
- substitute(hat(p)[1] == e1~~hat(mu)[1] == e2~~hat(sigma)[1] == e3,
- list(e1 = round(estimate[1],3),
- e2 = round(estimate[2],1),
- e3 = round(estimate[3],1))))
-
- text(x = max(sizeKb)/2, y = 0.90*ymax, col="blue", pos = 4, cex=1.2,
- substitute(hat(p)[2] == q~~hat(mu)[2] == e4~~hat(sigma)[2] == e5,
- list(q = round(1 - estimate[1],3),
- e4 = round(estimate[4],1),
- e5 = round(estimate[5],1))))
-
- lines(x=dst$x, y=vscale*dst$y)
- legend(x = max(sizeKb)/2, y = 0.75*ymax, lty=1,
- "Gaussian kernel density estimation")
-
- mtext(sodtxt)
- }
-##########################################
-#
-# Fitting a mixture of three normal distributions
-#
-###########################################
- if(fit == 3)
- {
- logvraineg <- function(param, obs)
- {
- p <- param[1]
- m1 <- param[2]
- sd1 <- param[3]
- m2 <- param[4]
- sd2 <- param[5]
- p3 <- param[6]
- m3 <- param[7]
- sd3 <- param[8]
-
- -sum(log(p*dnorm(obs,m1,sd1)
- +(1-p-p3)*dnorm(obs,m2,sd2)
- +p3*dnorm(obs,m3,sd3)))
- }
-
- nlmres <- suppressWarnings(nlm(logvraineg, c(p, m1, sd1, m2, sd2, p3, m3, sd3), obs=sizeKb))
- estimate <- nlmres$estimate
-
- y1 <- vscale*estimate[1]*dnorm(x, estimate[2], estimate[3])
- y2 <- vscale*(1-estimate[1]-estimate[6])*dnorm(x, estimate[4], estimate[5])
- y3 <- vscale*estimate[6]*dnorm(x, estimate[7], estimate[8])
-
- hst <- hist(sizeKb, plot = FALSE, breaks = mybreaks)
- ymax <- max(y1, y2, y3, hst$counts)
-
- hist(sizeKb, freq = TRUE, ylim=c(0,ymax),
- breaks = mybreaks,
- main=paste("Genome size distribution for", n, "bacterial genomes"),
- xlab="Genome size [Kb]",
- ylab="Genome count",
- col="lightgrey")
-
- lines(x, y1, col="red", lwd=2)
- lines(x, y2, col="blue", lwd=2)
- lines(x, y3, col="green3", lwd=2)
-
- text(x = max(sizeKb)/2, y = ymax, pos=4, "Maximum likelihood estimates:")
-
- text(x = max(sizeKb)/2, y = 0.95*ymax, col="red", pos = 4, cex=1.2,
- substitute(hat(p)[1] == e1~~hat(mu)[1] == e2~~hat(sigma)[1] == e3,
- list(e1 = round(estimate[1],3),
- e2 = round(estimate[2],1),
- e3 = round(estimate[3],1))))
-
- text(x = max(sizeKb)/2, y = 0.90*ymax, col="blue", pos = 4, cex=1.2,
- substitute(hat(p)[2] == q~~hat(mu)[2] == e4~~hat(sigma)[2] == e5,
- list(q = round(1 - estimate[1]-estimate[6],3),
- e4 = round(estimate[4],1),
- e5 = round(estimate[5],1))))
-
- text(x = max(sizeKb)/2, y = 0.85*ymax, col="green3", pos = 4, cex=1.2,
- substitute(hat(p)[3] == p3~~hat(mu)[3] == e7~~hat(sigma)[3] == e8,
- list(p3 = round(estimate[6],3),
- e7 = round(estimate[7],1),
- e8 = round(estimate[8],1))))
-
- dst <- density(sizeKb)
- lines(x=dst$x, y=vscale*dst$y)
- legend(x = max(sizeKb)/2, y = 0.75*ymax, lty=1,
- "Gaussian kernel density estimation")
-
- mtext(sodtxt)
- }
-#
-# Return invisibly the dataset
-#
- invisible(data)
-}
-
-
-
-
-
-
-
-
-
-
-
-
Added: pkg/R/dia.bactgensize.R
===================================================================
--- pkg/R/dia.bactgensize.R (rev 0)
+++ pkg/R/dia.bactgensize.R 2009-01-21 09:22:27 UTC (rev 1528)
@@ -0,0 +1,208 @@
+dia.bactgensize <- function(
+ fit = 2, p = 0.5, m1 = 2000, sd1 = 600, m2 = 4500, sd2 = 1000, p3 = 0.05,
+ m3 = 9000, sd3 = 1000, maxgensize = 20000,
+ source = c(system.file("sequences/goldtable15Dec07.txt", package = "seqinr"),
+ "http://www.genomesonline.org/DBs/goldtable.txt"))
+{
+#
+# Use local source by default:
+#
+ source <- source[1]
+#
+# Build source of data string:
+#
+ if(source == system.file("sequences/goldtable15Dec07.txt", package = "seqinr")){
+ sodtxt <- "Source of data: GOLD (Genomes OnLine Database) 15 Dec 2007"
+ } else {
+ sodtxt <- paste("Source of data: GOLD (Genomes OnLine Database)",date())
+ }
+#
+# Read data from GOLD:
+#
+ alldata <- read.table(source, header = TRUE, sep = "\t",
+ comment.char = "", quote = "")
+ SUPERKINGDOM <- 1 # col number
+ kingdom <- alldata[, SUPERKINGDOM]
+ prodata <- alldata[ kingdom == "Archaea" | kingdom == "Bacteria", ]
+ data <- prodata[, c("GENUS", "SPECIES", "SIZE.kb.")]
+ names(data) <- c("genus", "species", "gs")
+ data <- data[complete.cases(data), ]
+#
+# Remove data > maxgensize:
+#
+ data <- data[data$gs <= maxgensize, ]
+#
+# Use Kb scale
+#
+ sizeKb <- data$gs
+ n <- length(sizeKb)
+#
+# Graphics
+#
+ x <- seq( min(sizeKb), max(sizeKb), le=200)
+ mybreaks <- seq(min(sizeKb),max(sizeKb),length=15)
+ vscale <- diff(mybreaks)[1]*n
+
+ if(fit == 0)
+ {
+ hst <- hist(sizeKb, freq = TRUE,
+ breaks = mybreaks,
+ main=paste("Genome size distribution for", n, "bacterial genomes"),
+ xlab="Genome size [Kb]",
+ ylab="Genome count",
+ col="lightgrey")
+
+ dst <- density(sizeKb)
+ lines(x=dst$x, y=vscale*dst$y)
+ legend(x = max(sizeKb)/2, y = 0.75*max(hst$counts), lty=1,
+ "Gaussian kernel density estimation")
+
+ mtext(sodtxt)
+ }
+##########################################
+#
+# Fitting a mixture of two normal distributions
+#
+###########################################
+ if(fit == 2)
+ {
+ logvraineg <- function(param, obs)
+ {
+ p <- param[1]
+ m1 <- param[2]
+ sd1 <- param[3]
+ m2 <- param[4]
+ sd2 <- param[5]
+
+ -sum(log(p*dnorm(obs,m1,sd1)+(1-p)*dnorm(obs,m2,sd2)))
+ }
+
+ nlmres <- suppressWarnings(nlm(logvraineg, c(p, m1, sd1, m2, sd2), obs=sizeKb))
+ estimate <- nlmres$estimate
+
+ y1 <- vscale*estimate[1]*dnorm(x, estimate[2], estimate[3])
+ y2 <- vscale*(1-estimate[1])*dnorm(x, estimate[4], estimate[5])
+ dst <- density(sizeKb)
+
+ hst <- hist(sizeKb, plot = FALSE, breaks = mybreaks)
+ ymax <- max(y1, y2, hst$counts, vscale*dst$y)
+
+
+ hist(sizeKb, freq = TRUE, ylim=c(0,ymax),
+ breaks = mybreaks,
+ main=paste("Genome size distribution for", n, "bacterial genomes"),
+ xlab="Genome size [Kb]",
+ ylab="Genome count",
+ col="lightgrey")
+
+ lines(x, y1, col="red", lwd=2)
+ lines(x, y2, col="blue", lwd=2)
+
+ text(x = max(sizeKb)/2, y = ymax, pos=4, "Maximum likelihood estimates:")
+
+ text(x = max(sizeKb)/2, y = 0.95*ymax, col="red", pos = 4, cex=1.2,
+ substitute(hat(p)[1] == e1~~hat(mu)[1] == e2~~hat(sigma)[1] == e3,
+ list(e1 = round(estimate[1],3),
+ e2 = round(estimate[2],1),
+ e3 = round(estimate[3],1))))
+
+ text(x = max(sizeKb)/2, y = 0.90*ymax, col="blue", pos = 4, cex=1.2,
+ substitute(hat(p)[2] == q~~hat(mu)[2] == e4~~hat(sigma)[2] == e5,
+ list(q = round(1 - estimate[1],3),
+ e4 = round(estimate[4],1),
+ e5 = round(estimate[5],1))))
+
+ lines(x=dst$x, y=vscale*dst$y)
+ legend(x = max(sizeKb)/2, y = 0.75*ymax, lty=1,
+ "Gaussian kernel density estimation")
+
+ mtext(sodtxt)
+ }
+##########################################
+#
+# Fitting a mixture of three normal distributions
+#
+###########################################
+ if(fit == 3)
+ {
+ logvraineg <- function(param, obs)
+ {
+ p <- param[1]
+ m1 <- param[2]
+ sd1 <- param[3]
+ m2 <- param[4]
+ sd2 <- param[5]
+ p3 <- param[6]
+ m3 <- param[7]
+ sd3 <- param[8]
+
+ -sum(log(p*dnorm(obs,m1,sd1)
+ +(1-p-p3)*dnorm(obs,m2,sd2)
+ +p3*dnorm(obs,m3,sd3)))
+ }
+
+ nlmres <- suppressWarnings(nlm(logvraineg, c(p, m1, sd1, m2, sd2, p3, m3, sd3), obs=sizeKb))
+ estimate <- nlmres$estimate
+
+ y1 <- vscale*estimate[1]*dnorm(x, estimate[2], estimate[3])
+ y2 <- vscale*(1-estimate[1]-estimate[6])*dnorm(x, estimate[4], estimate[5])
+ y3 <- vscale*estimate[6]*dnorm(x, estimate[7], estimate[8])
+
+ hst <- hist(sizeKb, plot = FALSE, breaks = mybreaks)
+ ymax <- max(y1, y2, y3, hst$counts)
+
+ hist(sizeKb, freq = TRUE, ylim=c(0,ymax),
+ breaks = mybreaks,
+ main=paste("Genome size distribution for", n, "bacterial genomes"),
+ xlab="Genome size [Kb]",
+ ylab="Genome count",
+ col="lightgrey")
+
+ lines(x, y1, col="red", lwd=2)
+ lines(x, y2, col="blue", lwd=2)
+ lines(x, y3, col="green3", lwd=2)
+
+ text(x = max(sizeKb)/2, y = ymax, pos=4, "Maximum likelihood estimates:")
+
+ text(x = max(sizeKb)/2, y = 0.95*ymax, col="red", pos = 4, cex=1.2,
+ substitute(hat(p)[1] == e1~~hat(mu)[1] == e2~~hat(sigma)[1] == e3,
+ list(e1 = round(estimate[1],3),
+ e2 = round(estimate[2],1),
+ e3 = round(estimate[3],1))))
+
+ text(x = max(sizeKb)/2, y = 0.90*ymax, col="blue", pos = 4, cex=1.2,
+ substitute(hat(p)[2] == q~~hat(mu)[2] == e4~~hat(sigma)[2] == e5,
+ list(q = round(1 - estimate[1]-estimate[6],3),
+ e4 = round(estimate[4],1),
+ e5 = round(estimate[5],1))))
+
+ text(x = max(sizeKb)/2, y = 0.85*ymax, col="green3", pos = 4, cex=1.2,
+ substitute(hat(p)[3] == p3~~hat(mu)[3] == e7~~hat(sigma)[3] == e8,
+ list(p3 = round(estimate[6],3),
+ e7 = round(estimate[7],1),
+ e8 = round(estimate[8],1))))
+
+ dst <- density(sizeKb)
+ lines(x=dst$x, y=vscale*dst$y)
+ legend(x = max(sizeKb)/2, y = 0.75*ymax, lty=1,
+ "Gaussian kernel density estimation")
+
+ mtext(sodtxt)
+ }
+#
+# Return invisibly the dataset
+#
+ invisible(data)
+}
+
+
+
+
+
+
+
+
+
+
+
+
More information about the Seqinr-commits
mailing list