[Seqinr-commits] r1469 - pkg/inst/doc/src/mainmatter

Sat Oct 11 18:49:39 CEST 2008

Author: lobry
Date: 2008-10-11 18:49:39 +0200 (Sat, 11 Oct 2008)
New Revision: 1469

Modified:
   pkg/inst/doc/src/mainmatter/risa.rnw
Log:
updating new chapter

Modified: pkg/inst/doc/src/mainmatter/risa.rnw
===================================================================

--- pkg/inst/doc/src/mainmatter/risa.rnw	2008-10-11 13:01:18 UTC (rev 1468)
+++ pkg/inst/doc/src/mainmatter/risa.rnw	2008-10-11 16:49:39 UTC (rev 1469)
@@ -49,11 +49,15 @@
 
 \section{Finding a primer location}
 
-We want to fing a substring allowing for mismatches (say 3). Let's
+We want to fing a substring allowing for mismatches (say 3)
+but no indels\footnote{It would be better to code this as
+a regular expression to use standard tools but I don't know how 
+to do this.}. Let's
 write a function for this. Here we just use a moving window to count the
 number of matches for all positions and return the one with the
 maximum value. If the maximum number of matches if not enough, \texttt{NA}
-is returned instead.
+is returned instead. In the verbose the function produces a plot
+to check that everything is OK.
 
 <<findamo,fig=F>>=
 find.amo <- function(amo, myseq, verbose = FALSE, nmiss = 3){
@@ -78,10 +82,11 @@
 
 <<essai,fig=T,width=8,height=4>>=
 c2s(sample(s2c("acgt"), 500, rep=T))->rseq
-find.amo(amo1,rseq,verb=T)
+find.amo(amo1,rseq, verbose = TRUE)
 @ 
 
-Now insert a perfect target at position 100 in this random sequence:
+Now insert a perfect target for the first primer at position 100 in this random sequence
+to check that everything is OK :
 
 <<essai2,fig=T,width=8,height=4>>=
 substr(rseq,100,100+nchar(amo1)) <- amo1
@@ -143,44 +148,96 @@
 by its name (\texttt{mnemo} in the code thereafter).
 There could be more than one rRNA operon in the sequence fragment
 but there should be the same number of 16S and 23S genes.
-There is a maximum length to avoid problems when genes are
+There is a maximum length to the 16S-23S segemnt to avoid problems when genes are
 not annotated in consecutive order, in this case \texttt{NA}
 is returned. The default maximum length of 10 kb is conservative,
 the maximum observed value is 1.5 kb (\textit{cf} Fig. \ref{fig1RanjardL2000}),
 some post-processing of the results is most likely necessary to remove outliers.
+In case of problem during the query process the value \texttt{-Inf} is
+returned to denote this.
 
-<<mn2risa,fig=F>>=
+<<mn2risa,fig=F,keep.source=T>>=
 mn2risa <- function(mnemo, amo1, amo2, maxlength = 10000, verbose = FALSE){
-	  if(verbose) print(paste("mn2risa -->", mnemo))
-	  query("frag", paste("N=", mnemo))
-	  query("frag16S", "frag ET T=RRNA ET K=16S@")
-	  if(verbose) print(paste("n 16S = ", frag16S$nelem))
-	  query("frag23S", "frag ET T=RRNA ET K=23S@")
-     if(verbose) print(paste("n 23S = ", frag23S$nelem))
-     if(frag16S$nelem != frag23S$nelem) return(NA)
-     
-	  n <- frag16S$nelem
-	  loc1 <- getLocation(frag16S)
-	  loc2 <- getLocation(frag23S)
-	  risa <- numeric(n) 
-	  	for(i in seq_len(n)){
-		coords <- c(loc1[[i]], loc2[[i]])
-		if(coords[1] < coords[3]){
-		  forward <- TRUE
-		  if(verbose) print("forward")
-		} else {
-		  forward <- FALSE	
-		  if(verbose) print("bacward")
-		}
-		if(verbose) print(coords)
-		xmin <- min(coords)
-		xmax <- max(coords)
-		if(xmax - xmin > maxlength) return(NA)
-		myseq <- as.character(getFrag(frag$req[[1]], xmin, xmax))
-		if(verbose) print(paste("nchar myseq = ", nchar(myseq)))
-     	risa[i] <- risa.length(myseq, amo1, amo2, forward, verbose = F)$res
+  if(verbose) print(paste("mn2risa -->", mnemo))
+  #
+  # Make a list on server with the requested entry name:
+  #
+  try.res <- try(query("frag", paste("N=", mnemo)))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # From this make a list with all subsequences that are rRRA genes
+  # with a keyword containing 16S anywhere in it:
+  #
+  try.res <- try(query("frag16S", "frag ET T=RRNA ET K=@16S@"))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  if(verbose) print(paste("n 16S = ", frag16S$nelem))
+  #
+  # The same but with 23S anywhere in keywords:
+  #
+  try.res <- query("frag23S", "frag ET T=RRNA ET K=@23S@")
+  if(verbose) print(paste("n 23S = ", frag23S$nelem))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # We want the same number of 16S and 23S rRNA in the entry:
+  #
+  if(frag16S$nelem != frag23S$nelem) return(NA)
+  #
+  # We retrieve the location of all 16S and 23S rRNA in this genbank entry:
+  #
+  try.res <- try(loc16S <- getLocation(frag16S))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  try.res <- try(loc23S <- getLocation(frag23S))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # The result is a vector with as many elements as rRNA operons
+  #
+  n <- frag16S$nelem
+  risa <- numeric(n)
+  #
+  # We loop now over all operons:
+  #
+  for(i in seq_len(n)){
+    coord.16S <- loc16S[[i]]
+    coord.23S <- loc23S[[i]]
+    #
+    # Test if the genes are in the forward or reverse strand:
+    #
+    if(coord.16S[1] < coord.23S[1]){
+      forward <- TRUE
+      if(verbose) print("forward")
+    } else {
+      forward <- FALSE	
+      if(verbose) print("bacward")
     }
-    return(risa)    
+    if(verbose) print(paste("16S at", coord.16S[1], coord.16S[2], "23S at", coord.23S[1], coord.23S[2]))
+    #
+    # Check that our operon is not too long:
+    #
+    xmin <- min(coord.16S, coord.23S)
+    xmax <- max(coord.16S, coord.23S)
+    if(xmax - xmin > maxlength){
+      warning(paste("Operon too long found, NA returned", mnemo, i))
+      risa[i] <- NA
+      next
+    }
+    #
+    # Get just the sequence of the operon from the genbank entry. This
+    # is the only place where we are retrieving sequence data. This
+    # return an objet of class SeqFrag that we cast into a simple
+    # character string.
+    #
+    try.res <- try(myseq <- as.character(getFrag(frag$req[[1]], xmin, xmax)))
+    if(inherits(try.res, "try-error")){
+      risa[i] <- -Inf
+      next
+    }
+    if(verbose) print(paste("nchar myseq = ", nchar(myseq)))
+    #
+    # Compute the IGS length on this operon
+    #
+    risa[i] <- risa.length(myseq, amo1, amo2, forward, verbose = F)$res
+  }
+  return(risa)    
 }
 @ 
 
@@ -192,7 +249,7 @@
 @ 
 
 Example with a fragment with seven 16S and seven 23S genes,
-the seven RISA length are returned :
+the seven IGS lengths are returned :
 
 <<AE005174,fig=F,eval=F>>=
 mn2risa("AE005174", amo1, amo2,verb=T)
@@ -203,51 +260,113 @@
 We could work in fact at any taxonomical level, but suppose here that
 we are interested by the species level. All we have to do is to find
 the list of fragment where there is at least one 16S and one 23S gene.
-We use here all the power of ACNUC query language. Note that species
-names may contain special characters so that we protect it here with
-quotes.
+We use here all the power of ACNUC query language. 
 
-<<sp2risa>>=
+
+<<sp2risa,fig=F,keep.source=T>>=
 sp2risa <- function(sp, amo1, amo2, verbose = TRUE){
-	if(verbose) print(paste("sp2risa -->", sp))
-	# protect with quotes
-	query("cursp", paste("\"sp=", sp, "\"", sep=""), virtual=TRUE)
-	query("res1", "cursp ET T=RRNA ET K=16S@", virtual=TRUE)
-	query("res1", "ME res1",virtual=TRUE)
-	query("res2","cursp ET T=RRNA ET K=23S@",virtual=TRUE)
-	query("res2","ME res2",virtual=TRUE)
-	query("res3", "res1 ET res2")
-	if(verbose) print(paste("number of mother sequences = ", res3$nelem))
-	seqnames <- getName(res3)
-	result <- vector("list", res3$nelem)
-	names(result) <- seqnames
-	for(i in seq_len(res3$nelem)){
-	  result[[i]] <- mn2risa(seqnames[i], amo1, amo2, verbose = verbose)
-	}
-	return(result)
+  if(verbose) print(paste("sp2risa -->", sp))
+  #
+  # protect query with quotes, get all sequences attached the specie
+  #  
+  try.res <- try(query("cursp", paste("\"sp=", sp, "\"", sep=""), virtual=TRUE))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # Get all 16S rRNA genes:
+  #
+  try.res <- try(query("res1", "cursp ET T=RRNA ET K=@16S@", virtual=TRUE))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # Replace by mother sequences:
+  #
+  try.res <- try(query("res1", "ME res1", virtual=TRUE))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # Get all 23S rRNA genes:
+  #
+  try.res <- try(query("res2","cursp ET T=RRNA ET K=@23S@", virtual=TRUE))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # Replace by mother sequences:
+  #
+  try.res <- try(query("res2","ME res2",virtual=TRUE))
+  if(inherits(try.res, "try-error")) return(-Inf)
+  #
+  # Keep only sequences that contains at least one 16S and 23S:
+  #
+  try.res <- try(query("res3", "res1 ET res2"))
+  if(inherits(try.res, "try-error")) return(-Inf)
+
+  if(verbose) print(paste("number of mother sequences = ", res3$nelem))
+  seqnames <- getName(res3)
+  result <- vector("list", res3$nelem)
+  names(result) <- seqnames
+  #
+  # Loop over all sequences:
+  #
+  for(i in seq_len(res3$nelem)){
+    result[[i]] <- mn2risa(seqnames[i], amo1, amo2, verbose = verbose)
+  }
+  return(result)
 }
 @ 
 
 
 \section{Loop over many species}
 
-We loop now over all bacterial species in genbank. As this is long, we run
+\subsection{Preprocessing: select interesting species}
+
+We select bacterial species for which there is at least one
+entry with at least one 16S and one 23S gene:
+
+<<prepro,keep.source=T,eval=F>>=
+#
+# Choose a bank:
+#
+choosebank("genbank")
+#
+# Select all bacterial sequences with 23S:
+#
+query("allbact", "SP=bacteria ET T=RRNA ET K=@23S@", virtual = TRUE)
+#
+# Replace by mother sequences:
+#
+query("allbact", "ME allbact", virtual = TRUE)
+#
+# Look for 16S in them:
+#
+query("allbact", "allbact ET T=RRNA ET K=@16S@", virtual = TRUE)
+#
+# Get species names:
+#
+query("splist", "PS allbact")
+#
+# Save them into a file:
+#
+splist <- getName(splist)
+head(splist)
+length(splist)
+save(splist, file = "splist.RData")
+@ 
+
+\subsection{Loop over our specie list}
+
+We loop now over our specie list. As this is long, we run
 it overnight in batch, saving results on the fly to spy them.
 When the species name is a single word this is most likely a genus,
 then to avoid redundancy in computation with the underlying species,
 it is not considered and a \texttt{+Inf} value is set. An empty list
 means that no fragment with both 16S and 23S genes were found. A
 missing value \texttt{NA} means that the PCR primers were not found.
+A \texttt{-Inf} value means a problem while querying the server.
 
 <<mainloop,fig=F,eval=F>>=
-query("all", "sp=bacteria", virtual=TRUE)
-query("allsp", "ps all")
-splist <- getName(allsp)
+load("splist.RData")
 resultat <- vector("list", length(splist))
 names(resultat) <- splist
 i <- 1
 for(sp in splist){
-  print(paste("===>",sp))
+  print(paste("===>", sp))
   if(length(unlist(strsplit(sp, split= " "))) == 1){
   	  resultat[[i]] <- +Inf
   	  i <- i + 1
@@ -262,7 +381,46 @@
 
 \section{Playing with results}
 
+<<load,fig=F>>=
+load("resultat.RData")
+@ 
 
+There shouldn't be any null entries in results, except if we are
+spying them.
+
+<<anynull,fig=F>>=
+lesnull <- (unlist(lapply(resultat,is.null)))
+(nnull <- sum(lesnull))
+resultat <- resultat[!lesnull]
+@ 
+
+Show how many fragments we have by species :
+
+<<ndf,fig=F>>=
+table(unlist(lapply(resultat,length)))
+@ 
+
+How many IGS do we have there:
+
+<<brut,fig=F>>=
+brut <- unlist(resultat)
+length(brut)
+brut2 <- brut[!is.na(brut)]
+length(brut2)
+@ 
+
+<<bplot,fig=T,width=8,height=4>>=
+tab <- table(brut2)
+x <- as.numeric(unlist(dimnames(tab)))
+y <- tab
+plot(x,y, type = "h", ylim = c(0, max(y)),
+main = "Global distribution of IGS length",
+las = 1, ylab = "Count", xlab = "Size in bp")
+
+dst <- density(brut2, adj = 0.1)
+lines(dst$x, dst$y*max(y)/max(dst$y), col = "red", xpd=NA)
+@ 
+
 \SweaveInput{../config/sessionInfo.rnw}
 
 % END - DO NOT REMOVE THIS LINE