[Zooimage-commits] r245 - in pkg: mlearning phytoimage zooimage zooimage/R zooimage/inst zooimage/inst/planktonSorter zooimage/man zooimageJ

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Mar 2 08:52:45 CET 2014


Author: phgrosjean
Date: 2014-03-02 08:52:45 +0100 (Sun, 02 Mar 2014)
New Revision: 245

Added:
   pkg/zooimage/R/correction.R
   pkg/zooimage/R/planktonSorter.R
   pkg/zooimage/TODO
   pkg/zooimage/inst/planktonSorter/
   pkg/zooimage/inst/planktonSorter/.DS_Store
   pkg/zooimage/inst/planktonSorter/insert.gif
   pkg/zooimage/inst/planktonSorter/jquery-1.11.0.min.js
   pkg/zooimage/inst/planktonSorter/planktonSorter.css
   pkg/zooimage/inst/planktonSorter/planktonSorter.js
   pkg/zooimage/man/correctError.Rd
Modified:
   pkg/mlearning/DESCRIPTION
   pkg/phytoimage/DESCRIPTION
   pkg/zooimage/DESCRIPTION
   pkg/zooimage/NAMESPACE
   pkg/zooimage/NEWS
   pkg/zooimage/R/gui.R
   pkg/zooimage/man/gui.Rd
   pkg/zooimage/man/zooimage.package.Rd
   pkg/zooimageJ/DESCRIPTION
Log:
Adjustment in Author vs Authors@R fields in DESCRIPTION
Addition of error correction and (partial) suspects validation mechanism
Web user interface for validation (planktonSorter)
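
For reference, the core of the new mechanism is errorCorrection(), defined in
the added file pkg/zooimage/R/correction.R (its full signature appears in the
diff below). A hypothetical call, assuming 'zidat' is a ZIDat sample table,
'zic' a trained ZIClass classifier and "Sample.zidb" the matching zidb
database, could look like:

    res <- errorCorrection(zidat, zic, mode = "validation",
        fraction = 0.05, sample.min = 100, zidb = "Sample.zidb")

The exported user-level function correctError() is documented in the added
man/correctError.Rd.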

Modified: pkg/mlearning/DESCRIPTION
===================================================================
--- pkg/mlearning/DESCRIPTION	2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/mlearning/DESCRIPTION	2014-03-02 07:52:45 UTC (rev 245)
@@ -3,7 +3,12 @@
 Title: Machine learning algorithms with unified interface and confusion matrices
 Version: 1.0-0
 Date: 2012-08-04
-Author: Ph. Grosjean & K. Denis
+Author: Philippe Grosjean [aut, cre],
+  Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+  email = "phgrosjean@sciviews.org"),
+  person("Kevin", "Denis", role = "aut",
+  email = "kevin.denis@umons.ac.be"))
 Maintainer: Ph. Grosjean <Philippe.Grosjean@umons.ac.be>
 Depends: R (>= 2.14.0)
 Imports: stats, grDevices, class, nnet, MASS, e1071, randomForest, ipred 

Modified: pkg/phytoimage/DESCRIPTION
===================================================================
--- pkg/phytoimage/DESCRIPTION	2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/phytoimage/DESCRIPTION	2014-03-02 07:52:45 UTC (rev 245)
@@ -3,10 +3,14 @@
 Title: Analysis of numerical phytoplankton images
 Version: 3.0-0
 Date: 2012-05-07
-Author: Ph. Grosjean & K. Denis
+Author: Philippe Grosjean [aut, cre],
+  Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+  email = "phgrosjean@sciviews.org"),
+  person("Kevin", "Denis", role = "aut",
+  email = "kevin.denis@umons.ac.be"))
 Maintainer: Ph. Grosjean <Philippe.Grosjean@umons.ac.be>
 Depends: R (>= 2.14.0), svMisc (>= 0.9-66), svDialogs (>= 0.9-53), zooimage (>= 3.0-0)
-Suggests: zooimage
 Description: PhytoImage is a free (open source) solution for analyzing digital
 	images of phytoplankton. In combination with ImageJ, a free image analysis
 	system, it processes digital images, measures individuals, trains for

Modified: pkg/zooimage/DESCRIPTION
===================================================================
--- pkg/zooimage/DESCRIPTION	2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/DESCRIPTION	2014-03-02 07:52:45 UTC (rev 245)
@@ -1,12 +1,17 @@
 Package: zooimage
 Type: Package
 Title: Analysis of numerical zooplankton images
-Version: 3.0-7
-Date: 2013-12-10
-Author: Ph. Grosjean, K. Denis & R. Francois
+Version: 4.0-0
+Date: 2014-02-23
+Author: Philippe Grosjean [aut, cre],
+  Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+  email = "phgrosjean@sciviews.org"),
+  person("Kevin", "Denis", role = "aut",
+  email = "kevin.denis@umons.ac.be"))
 Maintainer: Philippe Grosjean <Philippe.Grosjean@umons.ac.be>
 Depends: R (>= 2.14.0), svMisc (>= 0.9-67), svDialogs (>= 0.9-53), mlearning
-Imports: filehash, jpeg, png, tiff, utils
+Imports: filehash, jpeg, png, tiff, utils, digest, tools
 Suggests: rJava, mlbench
 Description: ZooImage is a free (open source) solution for analyzing digital
 	images of zooplankton. In combination with ImageJ, a free image analysis

Modified: pkg/zooimage/NAMESPACE
===================================================================
--- pkg/zooimage/NAMESPACE	2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/NAMESPACE	2014-03-02 07:52:45 UTC (rev 245)
@@ -1,4 +1,6 @@
+import(digest)
 import(utils)
+import(tools)
 #import(tcltk)
 #import(tcltk2)
 import(svMisc)
@@ -20,6 +22,9 @@
 import(mlearning)
 #import(party)
 
+# planktonSorter
+export(correctError)
+
 # Zic
 export(zicCheck)
 
@@ -152,6 +157,7 @@
 export(processSamples)
 export(removeObjects)
 export(saveObjects)
+export(validClass)
 export(vignettesClass)
 export(viewManual)
 export(viewResults)

Modified: pkg/zooimage/NEWS
===================================================================
--- pkg/zooimage/NEWS	2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/NEWS	2014-03-02 07:52:45 UTC (rev 245)
@@ -1,5 +1,17 @@
 = zooimage News
 
+== Changes in zooimage 4.0-0
+
+* Error correction functions added: correctError().
+
+* Plankton sorter interface for manual validation added.
+
+* New dependency on the digest package.
+
+* A new menu entry, Validate classification..., is added for validation and
+  error correction.
+
+
 == Changes in zooimage 3.0-7
 
 * readFlowCAMlst() can now read Visual Spreadsheet .lst file format 017 where

Added: pkg/zooimage/R/correction.R
===================================================================
--- pkg/zooimage/R/correction.R	                        (rev 0)
+++ pkg/zooimage/R/correction.R	2014-03-02 07:52:45 UTC (rev 245)
@@ -0,0 +1,1110 @@
+## TODO: disable next button when in last fraction
+## TODO: save temporary and final results in the zidb file
+## TODO: correct bugs with the back button
+
+## Load libraries
+#library(svMisc) #,  lib.loc = "./Libraries/V3")
+#library(svDialogs) #, lib.loc = "./Libraries/V3")
+#library(zooimage) #, lib.loc = "./Libraries/V3")
+##library(RANN) # Only function: nn2()... apparently not used here!
+#library(randomForest)
+#library(e1071)
+#library(RWeka)
+#library(C50)
+#library(ipred)
+
+#### Functions not used, but kept for possible future use ######################
+## Area under the curve
+auc <- function (x, y) {
+	if (length(x) != length(y) || length(x) < 2)
+		stop("'x' and 'y' must have same length >= 2")
+	sum(diff(x) * (y[-1] + y[-length(y)]) / 2)
+}
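+
+## Example (a minimal sketch, not part of the original code): the trapezoid
+## rule applied to a triangle of base 2 and height 1 gives an area of exactly 1
+if (FALSE) {
+	auc(x = c(0, 1, 2), y = c(0, 1, 0))  # returns 1
+}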
+
+
+#### Functions NOT used by errorCorrection(), but by examples ##################
+## TODO: corrHist() and addPoints() are used together... make it one plot() method?
+## TODO: make it an S3 object with print, summary and plot methods + predict + ...
+## Function to compute the size spectrum (histogram) of particles.
+## Can be used for error correction of biomass (future extension)
+corrHist <- function (data, subset, sizeParam, interval) {
+	data <- data[subset, sizeParam]
+	if (!length(data)) return(0)
+	hist(data, breaks = seq(min(data) - (2* interval), max(data) +
+		(2 * interval), interval), plot = FALSE)
+}
+
+## Add points to a histogram
+addPoints <- function (x, ...) {
+	if (!inherits(x, "histogram"))
+		stop("x must be an object of class 'histogram'")
+	points(x = x$mids[x$counts > 0], y = x$counts[x$counts > 0], ...)
+}
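+
+## Example (a sketch with a hypothetical 'Area' size variable and made-up
+## groups): compute the size spectrum of one predicted group and overlay the
+## counts as points
+if (FALSE) {
+	df <- data.frame(Area = rexp(200, rate = 2),
+		Predicted = sample(c("Copepoda", "Appendicularia"), 200, replace = TRUE))
+	h <- corrHist(df, subset = df$Predicted == "Copepoda",
+		sizeParam = "Area", interval = 0.05)
+	plot(h)
+	addPoints(h, col = "red", pch = 19)
+}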
+
+## Calculation of the residual error based on global error estimation
+residError <- function (ErrorEstim, Error, Validated, na.rm = FALSE) {
+	nObs <- length(Validated)
+	if (sum(Validated) == nObs) return(0) # No error since everybody validated
+	nErr <- ErrorEstim * nObs
+	nErrVal <- sum(Error & Validated, na.rm = na.rm)
+	nErrRes <- nErr - nErrVal
+	max(nErrRes / nObs, 0) # To avoid potential negative values
+}
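+
+## Example (a sketch with made-up numbers): 10 items, a global error estimated
+## at 20%, and 4 validated items among which 1 error was found => residual
+## error of (2 - 1) / 10 = 0.1
+if (FALSE) {
+	residError(ErrorEstim = 0.2, Error = c(TRUE, rep(FALSE, 9)),
+		Validated = c(rep(TRUE, 4), rep(FALSE, 6)))  # returns 0.1
+}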
+
+## Evolution of residual error
+residErrorEvol <- function (ErrorEstimEvol, Error, Validated, Step,
+na.rm = FALSE) {
+	nIter <- length(ErrorEstimEvol)
+	if (nIter == 0) return(numeric(0))
+		
+	res <- numeric(nIter)
+	for (i in 1:nIter) {
+		Valid <- Validated & Step < i  #Step %in% 0:(i - 1)
+		res[i] <- residError(ErrorEstim = ErrorEstimEvol[i], Error = Error,
+			Validated = Valid, na.rm = na.rm)
+	}
+	res
+}
+
+
+#### Functions used both by errorCorrection() and by examples ##################
+## Compute the Bray-Curtis dissimilarity index on two vectors
+## Note that this is faster than vegan::vegdist for two vectors
+## and it remains faster for a vector versus a matrix. But for all pairs,
+## use vegan::vegdist(x, method = "bray") instead
+## x must be a reference vector, and y can be a vector of same length,
+## or a matrix or a data frame with same number of rows
+dissimilarity <- function (x, y, na.rm = FALSE) {
+	if (is.data.frame(y)) y <- as.matrix(y)
+	if (!is.numeric(x) || !is.numeric(y))
+		stop("'x' and 'y' must be numeric")
+	if (!is.null(dim(x)))
+		stop("'x' must be a vector, not a matrix or an array")
+	if (is.matrix(y)) {
+		if (length(x) != nrow(y))
+			stop("'y' must have same rows as the length of 'x'")
+		
+				## The matrix calculation version
+		colSums(abs(x - y), na.rm = na.rm) /
+			colSums(rbind(y, sum(x, na.rm = na.rm)), na.rm  = na.rm)
+	
+	} else { # x and y must be vectors of same length
+		if (!is.null(dim(y)) || length(x) != length(y))
+			stop("'x' and 'y' must be two vectors of same length")
+		
+		## The simpler vector version
+		sum(abs(x - y), na.rm = na.rm) / sum(x, y, na.rm = na.rm) 
+	}
+}
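+
+## Example (a minimal sketch): Bray-Curtis dissimilarity between two abundance
+## vectors; the same value is obtained with vegan::vegdist(rbind(x, y), "bray")
+if (FALSE) {
+	dissimilarity(c(10, 5, 0), c(8, 2, 4))  # 9/29, i.e. about 0.31
+}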
+
+
+#### Functions used only by errorCorrection() ##################################
+## Calculation of the global error based on validated items
+## /!\ NOT based on items randomly selected --> Bad approximation
+.globalError <- function (suspect, error, subset, na.rm = FALSE) {
+	suspTot <- sum(suspect, na.rm = na.rm)
+	trustTot <- sum(!suspect, na.rm = na.rm)
+	vSusp <- suspect[subset]
+	vTrust <- !suspect[subset]
+	susp <- sum(vSusp, na.rm = na.rm)
+	trust <- sum(vTrust, na.rm = na.rm)
+	pSusp <- susp / suspTot
+	pTrust <- trust / trustTot
+	wTrust <- pSusp / pTrust
+	if (is.na(wTrust) || !is.finite(wTrust)) wTrust <- 1
+	vErr <- error[subset]
+	err <- sum(vErr[vSusp], na.rm = na.rm) + wTrust * sum(vErr[vTrust],
+		na.rm = na.rm)
+	tot <- susp + wTrust * trust
+	err / tot
+}
+
+## Error in each of the four fractions
+.errorInFractions <- function (suspect, error, validated, iteration = NA,
+	na.rm = FALSE) {
+  
+	## The four fractions
+	suspValIdx <- suspect & validated 
+	suspNotValIdx <- suspect & !validated
+	trustValIdx <- !suspect & validated
+	trustNotValIdx <- !suspect & !validated
+  
+	## Error in each fraction
+	errSuspValIdx <- error[suspValIdx]
+	errSuspNotValIdx <- error[suspNotValIdx]
+	errTrustValIdx <- error[trustValIdx]
+	errTrustNotValIdx <- error[trustNotValIdx]
+  
+	## Number of items in each fraction
+	nSuspVal <- sum(suspValIdx, na.rm = na.rm)
+	nSuspNotVal <- sum(suspNotValIdx, na.rm = na.rm)
+	nSuspTot <- nSuspVal + nSuspNotVal
+	nTrustVal <- sum(trustValIdx, na.rm = na.rm)
+	nTrustNotVal <- sum(trustNotValIdx, na.rm = na.rm)
+	nTrustTot <- nTrustVal + nTrustNotVal
+   
+	## Number of errors in each fraction
+	nErrSuspVal <- sum(errSuspValIdx, na.rm = na.rm) 
+	nErrSuspNotVal <- sum(errSuspNotValIdx, na.rm = na.rm) 
+	nErrSuspTot <- nErrSuspVal + nErrSuspNotVal
+	nErrTrustVal <- sum(errTrustValIdx, na.rm = na.rm) 
+	nErrTrustNotVal <- sum(errTrustNotValIdx, na.rm = na.rm) 
+	nErrTrustTot <- nErrTrustVal + nErrTrustNotVal
+  
+	## Number of errors in the validated fraction if errors were randomly distributed
+	nErrSuspValRd <- nErrSuspTot / nSuspTot * nSuspVal  
+	nErrTrustValRd <- nErrTrustTot / nTrustTot * nTrustVal
+  
+	## Error rate in each fraction
+	errSuspVal <- nErrSuspVal / nSuspVal
+	errSuspNotVal <- nErrSuspNotVal / nSuspNotVal
+	errTrustVal <- nErrTrustVal / nTrustVal
+	errTrustNotVal <- nErrTrustNotVal / nTrustNotVal
+  
+	## Weight for trusted items
+	probaSusp <- nSuspVal / nSuspTot
+	probaTrust <- nTrustVal / nTrustTot
+	weightTrust <- probaSusp / probaTrust
+  
+	## Results: data frame
+	if (!all(is.na(iteration))) {
+  	## Calculation of error in validated fraction at current iteration
+		suspStepIdx <- suspect & iteration 
+		trustStepIdx <- !suspect & iteration
+		errSuspStepIdx <- error[suspStepIdx]
+		errTrustStepIdx <- error[trustStepIdx]
+		nSuspStep <- sum(suspStepIdx, na.rm = na.rm)
+		nTrustStep <- sum(trustStepIdx, na.rm = na.rm)
+		nErrSuspStep <- sum(errSuspStepIdx, na.rm = na.rm)
+		nErrTrustStep <- sum(errTrustStepIdx, na.rm = na.rm)
+		errSuspStep <- nErrSuspStep / nSuspStep
+		errTrustStep <- nErrTrustStep / nTrustStep
+
+		res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
+			errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
+			errSuspStep = errSuspStep, errTrustStep = errTrustStep, 
+			nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
+			nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
+			nErrSuspStep = nErrSuspStep, nErrTrustStep = nErrTrustStep,
+			nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
+			nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
+			nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
+			nTrustVal = nTrustVal, nSuspStep = nSuspStep,
+			nTrustStep = nTrustStep, nSuspNotVal = nSuspNotVal,
+			nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)    
+	} else {
+		## Calculation of error in global sample
+		res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
+			errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
+			nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
+			nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
+			nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
+			nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
+			nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
+			nTrustVal = nTrustVal, nSuspNotVal = nSuspNotVal,
+			nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)
+	}
+	res
+}
+
+## Compute probabilities that will be used for suspect detection
+classProba <- function (data, predicted = data$Predicted, classifier,
+diff.max = 0.2, prop.bio = NULL) {
+	## Get first and second identification probabilities
+	## TODO: only works for randomForest and does not use mlearning!
+	if (inherits(classifier, "randomForest")) {
+		if (inherits(classifier, "ZIClass")) {
+			data <- attr(classifier, "calc.vars")(data)
+		}
+		class(classifier) <- class(classifier)[-1]
+		prob <- predict(classifier, newdata = data, class.only = FALSE,
+			type = "membership")
+	} else stop("Suspect detection not yet implemented for this algorithm")
+	max.ids <- apply(prob, 1, sort, decreasing = TRUE)[1:2, ]
+	first <- max.ids[1, ]
+	second <- max.ids[2, ]
+
+	## 1) Coefficient quantifying quality of identification
+	identCoef <- sqrt(first * pmin((first - second) / (first * diff.max), 1))
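+	## Worked example (made-up numbers, not from the data): with first = 0.6,
+	## second = 0.5 and diff.max = 0.2, (0.6 - 0.5) / (0.6 * 0.2) = 0.83 and
+	## identCoef = sqrt(0.6 * 0.83) ~ 0.71; when first - second >= first * diff.max,
+	## the coefficient simplifies to sqrt(first)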
+	
+	## 2) Probability of identification in that group
+	#first
+	
+	## 3) Difference between first and second probabilities	
+	probaDiff <- first - second
+
+	## 4) Proportion of the group (rare groups tend to have more FP!)
+	predTable <- table(predicted)
+	prop <- predTable / sum(predTable)
+	gpProp <- prop[as.character(predicted)]
+	
+	# 5) ProbaBio modulation?
+	if (is.null(prop.bio)) {
+		proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
+		ProbaDiff = probaDiff, GpProp = gpProp)
+	} else {
+		bioFact <- prop.bio[as.character(predicted)]
+		proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
+		ProbaDiff = probaDiff, GpProp = gpProp, BioFact = bioFact)
+	}
+	attr(proba, "IdentParam") <- list(DiffMax = diff.max,
+		ProbaBio = prop.bio, ProbaTable = prob)
+	proba
+}
+
+## Compute the proportion of false positives according to Bayes theorem
+corrProba <- function (proba, predicted, confusion, nobs) {
+	stats <- summary(confusion, type = c("Recall", "Specificity"))
+	table.pred <- table(predicted)
+	prop <- table.pred / nobs
+	recall <- stats$Recall 
+	specif <- stats$Specificity
+	fpprop <- numeric(length(table.pred))
+	for (name in names(table.pred)) {
+		prop.a <- prop[name]
+		prop.b <- 1 - prop.a
+		recall.a <- stats[name, ]$Recall
+		fprate <- 1 - stats[name, ]$Specificity
+		b.in.a <- fprate * prop.b
+		a.in.a <- recall.a * prop.a
+		fpprop[name] <- b.in.a / (b.in.a + a.in.a)
+		if (is.nan(fpprop[name])) fpprop[name] <- 1
+	}
+	proba$FP <- fpprop[as.character(predicted)]
+	## Compare this proportion to group proportion
+	proba$GpFPDiff <- proba$GpProp - proba$FP
+	proba
+}
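+
+## Worked example of the Bayes computation above (made-up numbers): for a group
+## representing 10% of the items (prop.a = 0.1) with Recall = 0.9 and
+## Specificity = 0.95 (thus fprate = 0.05):
+##   b.in.a = 0.05 * 0.9 = 0.045  (items of other groups falling in this one)
+##   a.in.a = 0.9 * 0.1  = 0.09   (items of the group correctly predicted)
+##   FP     = 0.045 / (0.045 + 0.09) = 1/3
+## so about one third of the items predicted in that group are expected to be
+## false positives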
+
+
+## Detect suspect particles using different algorithms and update corr$Suspect
+suspect <- function (data, proba, error, subset, predicted, confusion,
+algorithm = "rf", knn.k = 3, svm.kernel = "polynomial", svm.degree = 5, ...) {
+	algorithm <- match.arg(algorithm,
+		c("rf", "knn", "svm", "bayes", "c50", "glm"))
+	
+	## In the improbable case we have no error at all, consider everyone as suspect...
+	if (all(as.character(error) != "Error"))
+		return(factor(rep("Error", length(error)), levels = levels(error)))
+		
+	## Update proba with latest calculations, according to a corrected confusion matrix
+	proba <- corrProba(proba, predicted, confusion, nrow(data))
+	set <- cbind(data, proba, Error = error)
+	
+	## TODO: mlearning is there for complete interchangeability of algorithms
+	## thus, we should not need this switch() construct!
+	res <- switch(algorithm,
+		rf = {
+			rf.model <- mlearning(formula = Error ~ ., data = set,
+				subset = subset, method = "mlRforest", ...)    
+			susp <- predict(rf.model, newdata = set, type = "class")
+			susp[subset] <- predict(rf.model, type = "class", method = "oob")  
+		},
+		knn = {
+		#	set <- subset(set, select = -c(Error))
+		#	warning("Prediction of training set by 'knn' method without cross validation")
+		#	susp <- knn(train = set[subset, ], test = set,
+		#		cl = corr$Error[subset], k = knn.k, ...)
+		stop("Not implemented yet!")
+		},
+		svm = { 
+		#	svm.model <- mlearning(Error ~ ., data = set,
+		#		subset = subset, kernel = svm.kernel, degree = svm.degree,
+		#		method = "mlSvm", ...)
+		#	susp <- predict(svm.model, newdata = set, type = "class")
+		#	susp[subset] <- predict(svm.model, type = "class", method = "cv") 
+		stop("Not implemented yet!")
+		},
+		bayes = {
+		#	nb.model <- mlearning(Error ~ ., data = set,
+		#		subset = subset, method = "mlNaiveBayes", ...) 
+		#	susp <- predict(nb.model, newdata = set, type = "class")
+		#	susp[subset] <- predict(nb.model, type = "class", method = "cv") 
+		stop("Not implemented yet!")
+		},
+		c50 = {
+		#	C50.train <- set[subset, ]
+		#	c50.model <- C5.0(formula = Error ~ ., data = C50.train, ...)
+		#	susp <- predict(c50.model, newdata = set, type = "class")
+		#	susp[subset] <- cv(y = C50.train$Error, formula = Error ~ .,
+		#		data = C50.train, model =  C5.0, predict =
+		#		function(object, newdata) predict(object, newdata,
+		#		type = "class"), k = 10, predictions = TRUE)$predictions  
+		stop("Not implemented yet!")
+		},
+		glm =  {
+		#	glm.model <- Logistic(Error ~ ., data = set, subset = subset, ...)
+		#	warning("Prediction of training set by 'glm' method without cross-validation")
+		#	susp <- predict(glm.model, newdata = set, type = "class")
+		stop("Not implemented yet!")
+		},
+		stop("Unknown algorithm '", algorithm, "'")
+	)
+	susp
+}
+
+## Main function for error correction
+## Replace data by ZIobj and put zidb in first position... or group both items
+## and how to retrieve class then???
+## Group options in a single object too
+errorCorrection <- function (data, classifier, mode = "validation",
+fraction = 0.05, sample.min = 100, grp.min = 2, random.sample = 0.1,
+algorithm = "rf", diff.max = 0.2, prop.bio = NULL,
+zidb = NULL, testdir = NULL, id = NULL,
+result = ".last_valid", envir = parent.frame()) {
+	#### Explanation of the parameters
+	#### Data and classifier
+	## data -- the dataset to study
+	if (missing(data) || !inherits(data, "ZIDat"))
+		stop("data must be a ZIdat object")
+	## classifier -- the classifier to use to classify particles
+	if (missing(classifier) || !inherits(classifier, "ZIClass"))
+		stop("classifier must be a ZIClass object")
+#######	calc.vars <- attr(classifier, "calc.vars")
+	
+	#### General parameters of the iterative correction
+	## mode -- the mode (validation, stat or demo)
+	mode <- match.arg(mode, c("validation", "demo", "stat"))
+	if (mode != "validation" & is.null(data$Class))
+		 	stop("data requires a 'Class' column in this mode")
+	## fraction -- fraction of sample to validate
+	fraction <- as.numeric(fraction)[1]
+	if (fraction < 0.01 || fraction > 1)
+		stop("fraction must be between 0.01 and 1")
+	## sample.min -- minimum number of particles to validate at each step
+	sample.min <- as.integer(sample.min)[1]
+	if (sample.min < 1)
+		stop("sample.min must be a positive integer")
+	## grp.min -- minimum number of particles of each group to validate
+	grp.min <- as.integer(grp.min)[1]
+	if (grp.min < 1 || grp.min > sample.min)
+		stop("grp.min must be a positive integer and cannot be larger than sample.min")
+	## random.sample -- fraction of random sample in the validation set
+	random.sample <- as.numeric(random.sample)[1]
+	if (random.sample < 0 || random.sample > 1)
+		stop("random.sample must be a number between 0 and 1")
+	
+	#### Parameters for the detection of suspects
+	## algorithm -- algorithm used to detect suspect particles
+	## diff.max -- maximum tolerated difference in probabilities for class identification
+	diff.max <- as.numeric(diff.max)[1]
+	if (diff.max < 0)
+		stop("diff.max must be a positive number or zero")
+	## prop.bio -- group probabilities, using biological information
+	if (length(prop.bio) && (!is.numeric(prop.bio) || is.null(names(prop.bio))))
+		stop("prop.bio must be a named vector (groups) of numbers")
+	
+	## zidb -- path to the zidbfile to manually validate
+	## testdir -- path of the directory used for manual validation
+	
+	## Variables
+	proba <- NULL			# identification probabilities (additional vars for suspect detection)
+	corr <- NULL			# main information about the correction
+	validated <- NULL		# validated class of the particles
+	validation.data <- NULL # Data sent for validation at a given step
+	predicted <- NULL		# predictions at the beginning
+	step <- -1 				# current iteration
+	sample.size <- NULL		# size of the next subsample to validate
+	## TODO: eliminate this and get it from table(corr$Step)
+	validated.fracs <- 0 	# fraction of items validated at each step
+
+	## Manual validation variables
+	ntrusted <- NULL			# number of trusted particles detected
+	nsuspect <- NULL			# number of suspect particles detected
+	ntrusted.tovalid <- NULL 	# number of trusted particles to validate
+	nsuspect.tovalid <- NULL 	# number of suspect particles to validate
+	testset <- NULL				# subsample to validate
+	
+	## Validation of particles TODO: get rid of these two flags!
+	step.manual <- FALSE		# should the user validate particles?
+	testset.validated <- FALSE	# is the testset validated?
+	
+	## Plankton sorter variables
+	sorter.title <- NULL 		# title of the plankton sorter page
+	sorter.id <- NULL			# identifier of the plankton sorter page
+
+	## Results
+	manual.history <- NULL		# history of the manual confusion matrix
+	manual2.history <- NULL		# history of manual + 2nd ident confusion matrix
+	corr.confusion <- NULL 		# corrected confusion matrix
+	correction.history <- NULL 	# history of the correction confusion matrix
+	error.estim.data <- NULL 	# data used to estimate the error
+	error.estim <- NULL			# history of the error estimation
+	error.estim.history <- NULL # evolution of the error estimation
+	error.estim.rd <- NULL 		# error using validated, randomly selected items
+	error.estim.rd.history <- NULL # evolution of the error estimation
+	suspect.history <- list()	# history of suspect detection
+	validated.history <- list()	# history of validated particles
+	error.valid.history <- list() # history of error among validated items
+	error.suspect.history <- list() # history of error among suspect items
+	error.trusted.history <- list() # history of error among trusted items
+	nsuspect.history <- NULL 	# history of the number of suspect items
+	ntrusted.history <- NULL 	# history of the number of trusted items
+	nsuspect.tovalid.history <- NULL # history of suspect validation
+	ntrusted.tovalid.history <- NULL # history of trusted validation
+	errInFract.history <- data.frame() # evolution of the error in each fraction
+
+	## Initialize the data for error correction (data, proba, corr and others)
+	initialize <- function () {
+		## Prepare the validation directory (ask to erase it if it already exists)
+		if (mode != "stat") {
+			if (is.null(testdir))
+				testdir <<- file.path(tempdir(),
+					zooimage::noExtension(zidb))
+			if (file.exists(testdir)) {
+				
+				res <- dlgMessage(paste("Temporary validation directory already",
+					"exists. Do we erase old data there?"), type = "okcancel")$res
+				if (res == "cancel")
+					stop("testdir (", testdir, ") must not exist yet!")
+				unlink(testdir, recursive = TRUE)
+			}	
+			dir.create(testdir, recursive = TRUE)
+			if (!file.exists(testdir))
+				stop("cannot create 'testdir'!")
+			## Put required files there: create the planktonSorter directory
+			plSort <- file.path(testdir, "planktonSorter")
+			dir.create(plSort)
+			plFiles <- dir(system.file("planktonSorter", package = "zooimage"),
+				full.names = TRUE)
+			res <- file.copy(plFiles, file.path(plSort, basename(plFiles)))
+			if (any(!res))
+				stop("Problem when copying one or more plankton sorter files")
+		}
+		
+		## Indicate that we are preparing the validation environment
+		cat("Preparing a validation environment...\n")
+		
+		## Be sure that data is correctly ordered
+		data <<- data[order(data$Item), ]
+		## Be sure 'Id' exists in data
+		data$Id <- makeId(data)
+
+		## Make sure items' membership is predicted by the classifier
+		predicted <- data$Predicted
+		## TODO: shouldn't we do this all the time???
+		if (is.null(predicted)) # Predict class if it wasn't done yet
+			predicted <- predict(classifier, data, class.only = TRUE,
+				type = "class")
+		predicted <<- predicted
+
+#		if (mode != "validation") {
+			## Store validated items and table of validated items
+			validated <<- data$Class 
+			## TODO: why this, and Class put in validated?
+			data$Class <- NULL
+			data <<- data
+#		}
+		## Keep all data
+###		data <<- attr(classifier, "calc.vars")(data)
+###		## Do not keep incomplete attributes
+###		data <<- data[, sapply(data, FUN = function(x) !any(is.na(x)))] 
+		proba <<- classProba(data, predicted, classifier, diff.max = diff.max,
+			prop.bio = prop.bio)
+		nobs <- nrow(data)
+		error <- factor(rep("Error", nobs), levels = c("Correct", "Error"))
+		## Compute the second possible identification
+		secondIdent <- function (groups) {
+			proba.table <- attr(proba, "IdentParam")$ProbaTable
+			proba.order <- apply(proba.table, 1, order, decreasing = TRUE)
+		
+			fst.ids <- proba.order[1, ]
+			scd.ids <- proba.order[2, ]
+		
+			proba.max <- apply(proba.table, 1, sort, decreasing = TRUE)[1, ]
+			max.ids <- proba.max == 1
+			scd.ids[max.ids] <- fst.ids[max.ids]
+		
+			factor(groups[scd.ids], levels = groups)
+		}
+		predicted2 <- secondIdent(levels(predicted))
+		
+		## Creation of corr object
+		corr <<- data.frame(Actual = predicted, Actual2 = predicted2,
+			Predicted = predicted, Predicted2 = predicted2, Validated = FALSE,
+			Error = error, Step = step, Suspect = rep(TRUE, nobs),
+			RdValidated = rep(Inf, nobs), OtherGp = rep(FALSE, nobs))
+    
+		## Confusion matrix of original classifier
+		train.validated <- attr(classifier, "response")
+		train.predicted <- attr(classifier, "cvpredict")
+		train.conf <- confusion(x = train.predicted, y = train.validated)
+		
+		## First error correction: we start with a Solow et al. type estimation
+		## Compute correction of abundances using a method similar to Solow et al.,
+		## but one that does not suffer from problems with singular matrices
+		## TODO: this takes a long time => try Solow et al. first before using this!
+		correctionLab <- function (x, conf) {
+			l <- length(x)
+			if (l != ncol(conf))
+				stop("'x' has not the same length than 'conf'")
+			
+			toMat <- function(x, byrow = TRUE)
+				matrix(rep(x, l), ncol = l, byrow = byrow)
+			
+			predMat <- toMat(colSums(conf))
+			xMat <- toMat(x)
+			
+			## Remove 0
+			notZero <- function (x) {
+				if (0 %in% x) x[x==0] <- 1
+				x
+			}
+			
+			confB <- conf / notZero(predMat) * xMat
+			x2 <- rowSums(confB)
+			classMat <- toMat(rowSums(conf), byrow = FALSE)
+			while (!isTRUE(all.equal(x, x2, tolerance = 0.00001))) {
+				x <- x2
+				confA <- conf / notZero(classMat) * toMat(x2, byrow = FALSE)
+				xMat2 <- toMat(colSums(confA))
+				confB <- confA / notZero(xMat2) * xMat
+				x2 <- rowSums(confB)
+			}
+			round(x2)
+		}
+		tablePredicted <- table(predicted)
+		correction.history <<- correctionLab(x = tablePredicted,
+			conf = train.conf)
+		manual.history <<- tablePredicted
+		manual2.history <<- manual.history
+	
+		## Increment step (should be 0 now, because initial value is -1)
+		step <<- step + 1
+		## Determine the number of vignettes to manually validate
+		setSampleSize()
+	}
+
+	## Compute the size of the next subsample: update sample.size
+	setSampleSize <- function () {
+		sample.size <<- round(min(nrow(corr[!corr$Validated, ]), # How much remains?
+			## Or number of items we want to take
+			max(nrow(data) * fraction, # Items to take
+				sample.min, # Minimum items we can take
+				## TODO: check the following one!
+				grp.min * length(table(predicted))))) # Minimum from groups
+	}	
+	
+	## Determine the subsample to validate
+	## Update Step and RdValidated (used for error estimation)
+	## Automatically place vignettes in directories for manual validation
+	prepareTest <- function () {
+		nobs <- nrow(data)
+		if (step < 1) {
+			## The first time, take a random subsample
+			## Same as considering everything as suspect
+			sample.ids <- sample(1:nobs, size = sample.size)
+			corr$Step[sample.ids] <<- step
+			corr$RdValidated[sample.ids] <<- step
+			nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, sample.size)
+			ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, 0)
+			nsuspect.history <<- c(nsuspect.history, nobs)
+			ntrusted.history <<- c(ntrusted.history, 0)
+		} else { # Step > 0
+			## Mix trusted and suspect particles
+			#validated <- corr[corr$Validated, ]
+			notvalidated <- corr[!corr$Validated, ]
+			nsuspect <<- sum(notvalidated$Suspect)	#nrow(notvalidated[notvalidated$Suspect, ])
+			ntrusted <<- nrow(notvalidated) - nsuspect	#nrow(notvalidated[!notvalidated$Suspect, ])
+			## Determine the number of random items used in error estimation
+			numRandom  <- max(sample.size - nsuspect,
+				round(random.sample * sample.size))
+			ids <- as.character(1:nobs)
+			## All items not included in RdValidated
+			tosample.ids <- ids[is.infinite(corr$RdValidated)]
+			## Items to validate
+			randomsample.ids <- as.numeric(sample(tosample.ids,
+				size = min(numRandom, length(tosample.ids))))
+			## Used to estimate error at step i
+			corr$RdValidated[randomsample.ids] <<- step
+			newstep <- corr$Step[randomsample.ids]
+			## Except those already validated and used for error estimation
+			newstep[newstep == -1] <- step
+			corr$Step[randomsample.ids] <<- newstep
+			notvalid.ids <- ids[!corr$Validated & corr$RdValidated == step]
+			## Number of items to validate in order to achieve sample.size
+			numSample <- sample.size - length(notvalid.ids) 
+			if (numSample > 0) {
+				## Randomly select suspect items not validated
+				suspnotval.ids <- ids[!corr$Validated & corr$Suspect &
+					is.infinite(corr$RdValidated) & corr$Step == -1]
+				if (length(suspnotval.ids)) {
+					suspsample.ids <- as.numeric(sample(suspnotval.ids,
+						size = min(numSample, length(suspnotval.ids))))
+					corr$Step[suspsample.ids] <<- step
+					numSample <- numSample - length(suspsample.ids)        
+				}
+				if (numSample > 0) {
+					## Randomly select trusted items not validated
+					trustnotval.ids <- ids[!corr$Validated & !corr$Suspect  &
+						is.infinite(corr$RdValidated) & corr$Step == -1]
+					if (length(trustnotval.ids)) {
+						trustsample.ids <- as.numeric(sample(trustnotval.ids,
+							size = min(numSample, length(trustnotval.ids))))
+						corr$Step[trustsample.ids] <<- step
+					}
+				}
+			} 
+			nsuspect.tovalid <- length(ids[corr$Step == step & corr$Suspect])
+			ntrusted.tovalid <- length(ids[corr$Step == step & !corr$Suspect])
+			nsuspect.history <<- c(nsuspect.history, nsuspect)
+			ntrusted.history <<- c(ntrusted.history, ntrusted)
+			nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, nsuspect.tovalid)
+			ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, ntrusted.tovalid)
+		}
+		
+		if (mode != "stat") {
+			## Make sure the R Httpd server is started
+			tools <- getNamespace("tools")
+			port <- tools$httpdPort
+			if (port == 0) port <- startDynamicHelp(TRUE)
+			if (port == 0) stop("Impossible to start the R httpd server")
+			
+			subdir <- paste0("step", step + 1) # Because it starts at 0,
+			## but this is really the beginning of the next step now
+			stepdir <- file.path(testdir, subdir)
+			dir.create(stepdir)			
+			Zidb <- zidbLink(zidb)
+			## Write data in test directory
+			keepRows <- corr$Step == step
+			Vigns <- data[keepRows, "Id"]
+			imgext <- Zidb[[".ImageType"]]
+			## Extract all these images to stepdir
+			vigpath <- file.path(stepdir, paste(Vigns, imgext, sep = "."))
+			names(vigpath) <- Vigns
+			if (length(Vigns))
+				for (j in 1:length(Vigns))
+					writeBin(Zidb[[Vigns[j]]], vigpath[j])			
+			
+			## Create all directories of groups
+			path <- attr(classifier, "path")
+			names(path) <- basename(path)
+			## Create planktonSorter.html file
+			vigfiles <- basename(vigpath)
+			pred <- as.character(corr[keepRows, "Predicted"])
+			names(vigfiles) <- pred
+			Sample <- as.character(sub("\\+.+_[0-9]+", "", Vigns[1]))
+			
+			if (step > 0) {
+				## Recreate the previous page with the sorted items
+				oldPage <- file.path(testdir, paste0("step", step),
+					"planktonSorter.html")
+				oldItems <- corr$Step == (step - 1)
+				oldVigns <- paste(data[oldItems, "Id"], imgext, sep = ".")
+				oldNames <- as.character(corr[oldItems, "Actual"])
+				oldNames[is.na(oldNames)] <- "[other]"
+				names(oldVigns) <- oldNames
+				res <- planktonSorterPage(groups = path, vigns = oldVigns,
+					title = sorter.title, id = sorter.id,
+					step = step, file = oldPage)
+			}
+			
+			## Create new page
+			plSorterPage <- file.path(stepdir, "planktonSorter.html")
+			sorter.title <<- paste0(Sample, "/Step", step + 1)
+			sorter.id <<- paste(id, step + 1, sep = "/")
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/zooimage -r 245

