[Zooimage-commits] r245 - in pkg: mlearning phytoimage zooimage zooimage/R zooimage/inst zooimage/inst/planktonSorter zooimage/man zooimageJ
noreply@r-forge.r-project.org
Sun Mar 2 08:52:45 CET 2014
Author: phgrosjean
Date: 2014-03-02 08:52:45 +0100 (Sun, 02 Mar 2014)
New Revision: 245
Added:
pkg/zooimage/R/correction.R
pkg/zooimage/R/planktonSorter.R
pkg/zooimage/TODO
pkg/zooimage/inst/planktonSorter/
pkg/zooimage/inst/planktonSorter/.DS_Store
pkg/zooimage/inst/planktonSorter/insert.gif
pkg/zooimage/inst/planktonSorter/jquery-1.11.0.min.js
pkg/zooimage/inst/planktonSorter/planktonSorter.css
pkg/zooimage/inst/planktonSorter/planktonSorter.js
pkg/zooimage/man/correctError.Rd
Modified:
pkg/mlearning/DESCRIPTION
pkg/phytoimage/DESCRIPTION
pkg/zooimage/DESCRIPTION
pkg/zooimage/NAMESPACE
pkg/zooimage/NEWS
pkg/zooimage/R/gui.R
pkg/zooimage/man/gui.Rd
pkg/zooimage/man/zooimage.package.Rd
pkg/zooimageJ/DESCRIPTION
Log:
Adjustment of Author vs. Authors@R fields in DESCRIPTION
Addition of error correction and (partial) suspects validation mechanism
Web user interface for validation (planktonSorter)
Modified: pkg/mlearning/DESCRIPTION
===================================================================
--- pkg/mlearning/DESCRIPTION 2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/mlearning/DESCRIPTION 2014-03-02 07:52:45 UTC (rev 245)
@@ -3,7 +3,12 @@
Title: Machine learning algorithms with unified interface and confusion matrices
Version: 1.0-0
Date: 2012-08-04
-Author: Ph. Grosjean & K. Denis
+Author: Philippe Grosjean [aut, cre],
+ Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+ email = "phgrosjean@sciviews.org"),
+ person("Kevin", "Denis", role = "aut",
+ email = "kevin.denis@umons.ac.be"))
Maintainer: Ph. Grosjean <Philippe.Grosjean@umons.ac.be>
Depends: R (>= 2.14.0)
Imports: stats, grDevices, class, nnet, MASS, e1071, randomForest, ipred
Modified: pkg/phytoimage/DESCRIPTION
===================================================================
--- pkg/phytoimage/DESCRIPTION 2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/phytoimage/DESCRIPTION 2014-03-02 07:52:45 UTC (rev 245)
@@ -3,10 +3,14 @@
Title: Analysis of numerical phytoplankton images
Version: 3.0-0
Date: 2012-05-07
-Author: Ph. Grosjean & K. Denis
+Author: Philippe Grosjean [aut, cre],
+ Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+ email = "phgrosjean@sciviews.org"),
+ person("Kevin", "Denis", role = "aut",
+ email = "kevin.denis@umons.ac.be"))
Maintainer: Ph. Grosjean <Philippe.Grosjean@umons.ac.be>
Depends: R (>= 2.14.0), svMisc (>= 0.9-66), svDialogs (>= 0.9-53), zooimage (>= 3.0-0)
-Suggests: zooimage
Description: PhytoImage is a free (open source) solution for analyzing digital
images of phytoplankton. In combination with ImageJ, a free image analysis
system, it processes digital images, measures individuals, trains for
Modified: pkg/zooimage/DESCRIPTION
===================================================================
--- pkg/zooimage/DESCRIPTION 2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/DESCRIPTION 2014-03-02 07:52:45 UTC (rev 245)
@@ -1,12 +1,17 @@
Package: zooimage
Type: Package
Title: Analysis of numerical zooplankton images
-Version: 3.0-7
-Date: 2013-12-10
-Author: Ph. Grosjean, K. Denis & R. Francois
+Version: 4.0-0
+Date: 2014-02-23
+Author: Philippe Grosjean [aut, cre],
+ Kevin Denis [aut]
+Authors@R: c(person("Philippe", "Grosjean", role = c("aut", "cre"),
+ email = "phgrosjean@sciviews.org"),
+ person("Kevin", "Denis", role = "aut",
+ email = "kevin.denis@umons.ac.be"))
Maintainer: Philippe Grosjean <Philippe.Grosjean@umons.ac.be>
Depends: R (>= 2.14.0), svMisc (>= 0.9-67), svDialogs (>= 0.9-53), mlearning
-Imports: filehash, jpeg, png, tiff, utils
+Imports: filehash, jpeg, png, tiff, utils, digest, tools
Suggests: rJava, mlbench
Description: ZooImage is a free (open source) solution for analyzing digital
images of zooplankton. In combination with ImageJ, a free image analysis
Modified: pkg/zooimage/NAMESPACE
===================================================================
--- pkg/zooimage/NAMESPACE 2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/NAMESPACE 2014-03-02 07:52:45 UTC (rev 245)
@@ -1,4 +1,6 @@
+import(digest)
import(utils)
+import(tools)
#import(tcltk)
#import(tcltk2)
import(svMisc)
@@ -20,6 +22,9 @@
import(mlearning)
#import(party)
+# planktonSorter
+export(correctError)
+
# Zic
export(zicCheck)
@@ -152,6 +157,7 @@
export(processSamples)
export(removeObjects)
export(saveObjects)
+export(validClass)
export(vignettesClass)
export(viewManual)
export(viewResults)
Modified: pkg/zooimage/NEWS
===================================================================
--- pkg/zooimage/NEWS 2013-12-10 11:07:44 UTC (rev 244)
+++ pkg/zooimage/NEWS 2014-03-02 07:52:45 UTC (rev 245)
@@ -1,5 +1,17 @@
= zooimage News
+== Changes in zooimage 4.0-0
+
+* Error correction function added: correctError().
+
+* Plankton sorter interface for manual validation added.
+
+* New dependency on the digest package.
+
+* A new menu entry, Validate classification..., is added for validation and
+ error correction.
+
+
== Changes in zooimage 3.0-7
* readFlowCAMlst() can now read Visual Spreadsheet .lst file format 017 where
Added: pkg/zooimage/R/correction.R
===================================================================
--- pkg/zooimage/R/correction.R (rev 0)
+++ pkg/zooimage/R/correction.R 2014-03-02 07:52:45 UTC (rev 245)
@@ -0,0 +1,1110 @@
+## TODO: disable next button when in last fraction
+## TODO: save temporary and final results in the zidb file
+## TODO: correct bugs with the back button
+
+## Load libraries
+#library(svMisc) #, lib.loc = "./Libraries/V3")
+#library(svDialogs) #, lib.loc = "./Libraries/V3")
+#library(zooimage) #, lib.loc = "./Libraries/V3")
+##library(RANN) # Only function: nn2()... apparently not used here!
+#library(randomForest)
+#library(e1071)
+#library(RWeka)
+#library(C50)
+#library(ipred)
+
+#### Functions not used, but kept for possible future use ######################
+## Area under the curve
+auc <- function (x, y) {
+ if (length(x) != length(y) || length(x) < 2)
+ stop("'x' and 'y' must have same length >= 2")
+ sum(diff(x) * (y[-1] + y[-length(y)]) / 2)
+}
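
For illustration (this snippet is not part of the commit), auc() implements the
trapezoidal rule; since the area under sin() on [0, pi/2] is exactly 1, the
trapezoids should land just below it for this concave curve:

    x <- seq(0, pi / 2, length.out = 101)
    auc(x, sin(x))  # ~0.99997, slightly below the true value of 1
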
+
+
+#### Functions NOT used by errorCorrection(), but by examples ##################
+## TODO: corrHist() and addPoints() are used together... make it one plot() method?
+## TODO: make it an S3 object with print, summary and plot methods + predict + ...
+## Function to plot size spectrum of particles.
+## Can be used for error correction of biomass (perspectives)
+corrHist <- function (data, subset, sizeParam, interval) {
+ data <- data[subset, sizeParam]
+ if (!length(data)) return(0)
+ hist(data, breaks = seq(min(data) - (2 * interval), max(data) +
+ (2 * interval), interval), plot = FALSE)
+}
+
+## Add points to an histogram
+addPoints <- function (x, ...) {
+ if (!inherits(x, "histogram"))
+ stop("x must be an object of class 'histogram'")
+ points(x = x$mids[x$counts > 0], y = x$counts[x$counts > 0], ...)
+}
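
corrHist() and addPoints() are meant to be used together: the first bins a size
parameter on a fixed interval without plotting, the second overlays the counts of
the non-empty bins. A toy sketch with a made-up ECD column (hypothetical data,
just to show the calling pattern):

    set.seed(42)
    dat <- data.frame(ECD = rlnorm(500, meanlog = 1, sdlog = 0.3))
    h <- corrHist(dat, subset = 1:500, sizeParam = "ECD", interval = 0.5)
    plot(h, main = "Size spectrum")
    addPoints(h, col = "red", pch = 19)
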
+
+## Calculation of the residual error based on global error estimation
+residError <- function (ErrorEstim, Error, Validated, na.rm = FALSE) {
+ nObs <- length(Validated)
+ if (sum(Validated) == nObs) return(0) # No error since everybody validated
+ nErr <- ErrorEstim * nObs
+ nErrVal <- sum(Error & Validated, na.rm = na.rm)
+ nErrRes <- nErr - nErrVal
+ max(nErrRes / nObs, 0) # To avoid potential negative values
+}
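
For example, with a 10% global error estimate on 200 items, 50 of them validated
and 8 of those being errors, residError() returns (20 - 8) / 200:

    validated <- rep(c(TRUE, FALSE), c(50, 150))
    error <- rep(FALSE, 200)
    error[c(1:8, 60:71)] <- TRUE  # 8 errors caught by validation, 12 not
    residError(ErrorEstim = 0.1, Error = error, Validated = validated)  # 0.06
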
+
+## Evolution of residual error
+residErrorEvol <- function (ErrorEstimEvol, Error, Validated, Step,
+na.rm = FALSE) {
+ nIter <- length(ErrorEstimEvol)
+ if (nIter == 0) return(numeric(0))
+
+ res <- numeric(nIter)
+ for (i in 1:nIter) {
+ Valid <- Validated & Step < i #Step %in% 0:(i - 1)
+ res[i] <- residError(ErrorEstim = ErrorEstimEvol[i], Error = Error,
+ Validated = Valid, na.rm = na.rm)
+ }
+ res
+}
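
residErrorEvol() applies the same computation per iteration, counting only items
whose Step is strictly before the current one. Continuing the toy example above,
with the 50 validated items tagged at step 0 and two successive error estimates:

    step <- rep(c(0, Inf), c(50, 150))  # Inf = never validated
    residErrorEvol(ErrorEstimEvol = c(0.10, 0.08), Error = error,
        Validated = validated, Step = step)  # 0.06, then (16 - 8)/200 = 0.04
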
+
+
+#### Functions used both by errorCorrection() and by examples ##################
+## Compute the Bray-Curtis dissimilarity index on two vectors
+## Note that this is faster than vegan::vegdist for two vectors
+## and it remains faster for a vector versus a matrix. But for all pairs,
+## use vegan::vegdist(x, method = "bray") instead
+## x must be a reference vector, and y can be a vector of same length,
+## or a matrix or a data frame with same number of rows
+dissimilarity <- function (x, y, na.rm = FALSE) {
+ if (is.data.frame(y)) y <- as.matrix(y)
+ if (!is.numeric(x) || !is.numeric(y))
+ stop("'x' and 'y' must be numeric")
+ if (!is.null(dim(x)))
+ stop("'x' must be a vector, not a matrix or an array")
+ if (is.matrix(y)) {
+ if (length(x) != nrow(y))
+ stop("'y' must have same rows as the length of 'x'")
+
+ ## The matrix calculation version
+ colSums(abs(x - y), na.rm = na.rm) /
+ colSums(rbind(y, sum(x, na.rm = na.rm)), na.rm = na.rm)
+
+ } else { # x and y must be vectors of same length
+ if (!is.null(dim(y)) || length(x) != length(y))
+ stop("'x' and 'y' must be two vectors of same length")
+
+ ## The simpler vector version
+ sum(abs(x - y), na.rm = na.rm) / sum(x, y, na.rm = na.rm)
+ }
+}
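
The returned value is the Bray-Curtis coefficient sum(|x - y|) / sum(x + y),
computed either for a pair of vectors or column-wise against a matrix, e.g.:

    x <- c(10, 5, 0, 3)
    y <- c(8, 2, 3, 3)
    dissimilarity(x, y)                # 8 / 34 = 0.235
    dissimilarity(x, cbind(y, 2 * x))  # same value, then 18 / 54 = 0.333
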
+
+
+#### Functions used only by errorCorrection() ##################################
+## Calculation of the global error based on validated items
+## /!\ NOT based on items randomly selected --> Bad approximation
+.globalError <- function (suspect, error, subset, na.rm = FALSE) {
+ suspTot <- sum(suspect, na.rm = na.rm)
+ trustTot <- sum(!suspect, na.rm = na.rm)
+ vSusp <- suspect[subset]
+ vTrust <- !suspect[subset]
+ susp <- sum(vSusp, na.rm = na.rm)
+ trust <- sum(vTrust, na.rm = na.rm)
+ pSusp <- susp / suspTot
+ pTrust <- trust / trustTot
+ wTrust <- pSusp / pTrust
+ if (is.na(wTrust) || !is.finite(wTrust)) wTrust <- 1
+ vErr <- error[subset]
+ err <- sum(vErr[vSusp], na.rm = na.rm) + wTrust * sum(vErr[vTrust],
+ na.rm = na.rm)
+ tot <- susp + wTrust * trust
+ err / tot
+}
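
Because suspects are validated preferentially, validated trusted items are
reweighted by the ratio of the two validated fractions before the errors are
pooled. A toy run of this internal helper:

    suspect <- rep(c(TRUE, FALSE), c(40, 160))
    error <- rep(FALSE, 200)
    error[c(1:10, 50:53)] <- TRUE         # 10 suspect errors, 4 trusted errors
    subset <- c(1:20, 41:60)              # 20 suspects and 20 trusted validated
    .globalError(suspect, error, subset)  # (10 + 4 * 4) / (20 + 4 * 20) = 0.26
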
+
+## Error in each of the four fractions
+.errorInFractions <- function (suspect, error, validated, iteration = NA,
+ na.rm = FALSE) {
+
+ ## The four fractions
+ suspValIdx <- suspect & validated
+ suspNotValIdx <- suspect & !validated
+ trustValIdx <- !suspect & validated
+ trustNotValIdx <- !suspect & !validated
+
+ ## Error in each fraction
+ errSuspValIdx <- error[suspValIdx]
+ errSuspNotValIdx <- error[suspNotValIdx]
+ errTrustValIdx <- error[trustValIdx]
+ errTrustNotValIdx <- error[trustNotValIdx]
+
+ ## Number of items in each fraction
+ nSuspVal <- sum(suspValIdx, na.rm = na.rm)
+ nSuspNotVal <- sum(suspNotValIdx, na.rm = na.rm)
+ nSuspTot <- nSuspVal + nSuspNotVal
+ nTrustVal <- sum(trustValIdx, na.rm = na.rm)
+ nTrustNotVal <- sum(trustNotValIdx, na.rm = na.rm)
+ nTrustTot <- nTrustVal + nTrustNotVal
+
+ ## Number of error in each fraction
+ nErrSuspVal <- sum(errSuspValIdx, na.rm = na.rm)
+ nErrSuspNotVal <- sum(errSuspNotValIdx, na.rm = na.rm)
+ nErrSuspTot <- nErrSuspVal + nErrSuspNotVal
+ nErrTrustVal <- sum(errTrustValIdx, na.rm = na.rm)
+ nErrTrustNotVal <- sum(errTrustNotValIdx, na.rm = na.rm)
+ nErrTrustTot <- nErrTrustVal + nErrTrustNotVal
+
+ ## Number of error in validated fraction if random distribution of the error
+ nErrSuspValRd <- nErrSuspTot / nSuspTot * nSuspVal
+ nErrTrustValRd <- nErrTrustTot / nTrustTot * nTrustVal
+
+ ## Error rate in each fraction
+ errSuspVal <- nErrSuspVal / nSuspVal
+ errSuspNotVal <- nErrSuspNotVal / nSuspNotVal
+ errTrustVal <- nErrTrustVal / nTrustVal
+ errTrustNotVal <- nErrTrustNotVal / nTrustNotVal
+
+ ## Weight for trusted items
+ probaSusp <- nSuspVal / nSuspTot
+ probaTrust <- nTrustVal / nTrustTot
+ weightTrust <- probaSusp / probaTrust
+
+ ## Results: data frame
+ if (!all(is.na(iteration))) {
+ ## Calculation of error in validated fraction at current iteration
+ suspStepIdx <- suspect & iteration
+ trustStepIdx <- !suspect & iteration
+ errSuspStepIdx <- error[suspStepIdx]
+ errTrustStepIdx <- error[trustStepIdx]
+ nSuspStep <- sum(suspStepIdx, na.rm = na.rm)
+ nTrustStep <- sum(trustStepIdx, na.rm = na.rm)
+ nErrSuspStep <- sum(errSuspStepIdx, na.rm = na.rm)
+ nErrTrustStep <- sum(errTrustStepIdx, na.rm = na.rm)
+ errSuspStep <- nErrSuspStep / nSuspStep
+ errTrustStep <- nErrTrustStep / nTrustStep
+
+ res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
+ errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
+ errSuspStep = errSuspStep, errTrustStep = errTrustStep,
+ nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
+ nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
+ nErrSuspStep = nErrSuspStep, nErrTrustStep = nErrTrustStep,
+ nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
+ nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
+ nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
+ nTrustVal = nTrustVal, nSuspStep = nSuspStep,
+ nTrustStep = nTrustStep, nSuspNotVal = nSuspNotVal,
+ nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)
+ } else {
+ ## Calculation of error in global sample
+ res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
+ errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
+ nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
+ nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
+ nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
+ nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
+ nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
+ nTrustVal = nTrustVal, nSuspNotVal = nSuspNotVal,
+ nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)
+ }
+ res
+}
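
A corresponding toy call for this helper, which crosses suspect/trusted with
validated/not validated and reports counts plus error rates per fraction (only
the four rate columns are shown here):

    suspect <- rep(c(TRUE, FALSE), c(50, 150))
    validated <- rep(c(TRUE, FALSE, TRUE, FALSE), c(25, 25, 30, 120))
    error <- rep(FALSE, 200)
    error[c(1:12, 51:54)] <- TRUE
    res <- .errorInFractions(suspect, error, validated)
    res[, c("errSuspVal", "errSuspNotVal", "errTrustVal", "errTrustNotVal")]
    ## 0.48, 0, 0.133, 0
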
+
+## Compute probabilities that will be used for suspect detection
+classProba <- function (data, predicted = data$Predicted, classifier,
+diff.max = 0.2, prop.bio = NULL) {
+ ## Get first and second identification probabilities
+ ## TODO: only works for randomForest and does not use mlearning!
+ if (inherits(classifier, "randomForest")) {
+ if (inherits(classifier, "ZIClass")) {
+ data <- attr(classifier, "calc.vars")(data)
+ }
+ class(classifier) <- class(classifier)[-1]
+ prob <- predict(classifier, newdata = data, class.only = FALSE,
+ type = "membership")
+ } else stop("Suspect detection not yet implemented for this algorithm")
+ max.ids <- apply(prob, 1, sort, decreasing = TRUE)[1:2, ]
+ first <- max.ids[1, ]
+ second <- max.ids[2, ]
+
+ ## 1) Coefficient quantifying quality of identification
+ identCoef <- sqrt(first * pmin((first - second) / (first * diff.max), 1))
+
+ ## 2) Probability of identification in that group
+ #first
+
+ ## 3) Difference between first and second probabilities
+ probaDiff <- first - second
+
+ ## 4) Proportion of the group (rare groups tend to have more FP!)
+ predTable <- table(predicted)
+ prop <- predTable / sum(predTable)
+ gpProp <- prop[as.character(predicted)]
+
+ ## 5) ProbaBio modulation?
+ if (is.null(prop.bio)) {
+ proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
+ ProbaDiff = probaDiff, GpProp = gpProp)
+ } else {
+ bioFact <- prop.bio[as.character(predicted)]
+ proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
+ ProbaDiff = probaDiff, GpProp = gpProp, BioFact = bioFact)
+ }
+ attr(proba, "IdentParam") <- list(DiffMax = diff.max,
+ ProbaBio = prop.bio, ProbaTable = prob)
+ proba
+}
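
The identification coefficient of point 1) grows with the winning probability and
with the gap to the runner-up, saturating once the gap exceeds first * diff.max.
Its shape is easy to check by hand (standalone reimplementation of that one line,
for illustration only):

    identCoef <- function (first, second, diff.max = 0.2)
        sqrt(first * pmin((first - second) / (first * diff.max), 1))
    identCoef(c(0.9, 0.5, 0.4), c(0.1, 0.45, 0.38))
    ## 0.949 (clear winner), 0.500 and 0.316 (close calls)
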
+
+## Compute the proportion of false positives according to Bayes theorem
+corrProba <- function (proba, predicted, confusion, nobs) {
+ stats <- summary(confusion, type = c("Recall", "Specificity"))
+ table.pred <- table(predicted)
+ prop <- table.pred / nobs
+ recall <- stats$Recall
+ specif <- stats$Specificity
+ fpprop <- setNames(numeric(length(table.pred)), names(table.pred))
+ for (name in names(table.pred)) {
+ prop.a <- prop[name]
+ prop.b <- 1 - prop.a
+ recall.a <- stats[name, ]$Recall
+ fprate <- 1 - stats[name, ]$Specificity
+ b.in.a <- fprate * prop.b
+ a.in.a <- recall.a * prop.a
+ fpprop[name] <- b.in.a / (b.in.a + a.in.a)
+ if (is.nan(fpprop[name])) fpprop[name] <- 1
+ }
+ proba$FP <- fpprop[as.character(predicted)]
+ ## Compare this proportion to group proportion
+ proba$GpFPDiff <- proba$GpProp - proba$FP
+ proba
+}
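
The loop is Bayes' theorem per group: with prevalence prop.a, recall recall.a and
false-positive rate 1 - specificity, the expected share of false positives among
a group's predictions is b.in.a / (b.in.a + a.in.a). Toy numbers show why rare
groups suffer:

    prop.a <- 0.05                   # a rare group (5% of the sample)
    recall.a <- 0.9
    fprate <- 1 - 0.95               # specificity of 0.95
    b.in.a <- fprate * (1 - prop.a)  # other groups leaking in
    a.in.a <- recall.a * prop.a      # true members recovered
    b.in.a / (b.in.a + a.in.a)       # ~0.51: half the predictions are wrong
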
+
+
+## Detect suspect particles using different algorithms and update corr$Suspect
+suspect <- function (data, proba, error, subset, predicted, confusion,
+algorithm = "rf", knn.k = 3, svm.kernel = "polynomial", svm.degree = 5, ...) {
+ algorithm <- match.arg(algorithm,
+ c("rf", "knn", "svm", "bayes", "c50", "glm"))
+
+ ## In the unlikely case there is no error at all, consider everything as suspect...
+ if (all(as.character(error) != "Error"))
+ return(factor(rep("Error", length(error)), levels = levels(error)))
+
+ ## Update proba with latest calculations, according to a corrected confusion matrix
+ proba <- corrProba(proba, predicted, confusion, nrow(data))
+ set <- cbind(data, proba, Error = error)
+
+ ## TODO: mlearning is there for complete interchangeability of algorithms
+ ## thus, we should not need this switch() construct!
+ res <- switch(algorithm,
+ rf = {
+ rf.model <- mlearning(formula = Error ~ ., data = set,
+ subset = subset, method = "mlRforest", ...)
+ susp <- predict(rf.model, newdata = set, type = "class")
+ susp[subset] <- predict(rf.model, type = "class", method = "oob")
+ },
+ knn = {
+ # set <- subset(set, select = -c(Error))
+ # warning("Prediction of training set by 'knn' method without cross validation")
+ # susp <- knn(train = set[subset, ], test = set,
+ # cl = corr$Error[subset], k = knn.k, ...)
+ stop("Not implemented yet!")
+ },
+ svm = {
+ # svm.model <- mlearning(Error ~ ., data = set,
+ # subset = subset, kernel = svm.kernel, degree = svm.degree,
+ # method = "mlSvm", ...)
+ # susp <- predict(svm.model, newdata = set, type = "class")
+ # susp[subset] <- predict(svm.model, type = "class", method = "cv")
+ stop("Not implemented yet!")
+ },
+ bayes = {
+ # nb.model <- mlearning(Error ~ ., data = set,
+ # subset = subset, method = "mlNaiveBayes", ...)
+ # susp <- predict(nb.model, newdata = set, type = "class")
+ # susp[subset] <- predict(nb.model, type = "class", method = "cv")
+ stop("Not implemented yet!")
+ },
+ c50 = {
+ # C50.train <- set[subset, ]
+ # c50.model <- C5.0(formula = Error ~ ., data = C50.train, ...)
+ # susp <- predict(c50.model, newdata = set, type = "class")
+ # susp[subset] <- cv(y = C50.train$Error, formula = Error ~ .,
+ # data = C50.train, model = C5.0, predict =
+ # function(object, newdata) predict(object, newdata,
+ # type = "class"), k = 10, predictions = TRUE)$predictions
+ stop("Not implemented yet!")
+ },
+ glm = {
+ # glm.model <- Logistic(Error ~ ., data = set, subset = subset, ...)
+ # warning("Prediction of training set by 'glm' method without cross-validation")
+ # susp <- predict(glm.model, newdata = set, type = "class")
+ stop("Not implemented yet!")
+ },
+ stop("Unknown algorithm '", algorithm, "'")
+ )
+ susp
+}
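
The only implemented branch ("rf") follows a standard mlearning pattern: fit
Error ~ . on the validated subset, predict every item, then overwrite the
training rows with out-of-bag predictions to avoid resubstitution bias. A
self-contained sketch of that pattern on iris (hypothetical Error column;
assumes the mlearning package from this same repository):

    library(mlearning)
    set.seed(1)
    set <- iris
    set$Error <- factor(ifelse(set$Species == "virginica", "Error", "Correct"))
    set$Species <- NULL
    train <- sample(nrow(set), 75)
    rf.model <- mlearning(Error ~ ., data = set, subset = train,
        method = "mlRforest")
    susp <- predict(rf.model, newdata = set, type = "class")
    susp[train] <- predict(rf.model, type = "class", method = "oob")
    table(susp)
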
+
+## Main function for error correction
+## Replace data by ZIobj and put zidb in first position... or group both items
+## and how to retrieve class then???
+## Group options in a single object too
+errorCorrection <- function (data, classifier, mode = "validation",
+fraction = 0.05, sample.min = 100, grp.min = 2, random.sample = 0.1,
+algorithm = "rf", diff.max = 0.2, prop.bio = NULL,
+zidb = NULL, testdir = NULL, id = NULL,
+result = ".last_valid", envir = parent.frame()) {
+ #### Parameter explanations
+ #### Data and classifier
+ ## data -- the dataset to study
+ if (missing(data) || !inherits(data, "ZIDat"))
+ stop("data must be a ZIdat object")
+ ## classifier -- the classifier to use to classify particles
+ if (missing(classifier) || !inherits(classifier, "ZIClass"))
+ stop("classifier must be a ZIClass object")
+####### calc.vars <- attr(classifier, "calc.vars")
+
+ #### General parameters of the iterative correction
+ ## mode -- the mode (validation, stat or demo)
+ mode <- match.arg(mode, c("validation", "demo", "stat"))
+ if (mode != "validation" && is.null(data$Class))
+ stop("data requires a 'Class' column in this mode")
+ ## fraction -- fraction of sample to validate
+ fraction <- as.numeric(fraction)[1]
+ if (fraction < 0.01 || fraction > 1)
+ stop("fraction must be between 0.01 and 1")
+ ## sample.min -- minimum number of particles to validate at each step
+ sample.min <- as.integer(sample.min)[1]
+ if (sample.min < 1)
+ stop("sample.min must be a positive integer")
+ ## grp.min -- minimum number of particles of each group to validate
+ grp.min <- as.integer(grp.min)[1]
+ if (grp.min < 1 || grp.min > sample.min)
+ stop("grp.min must be a positive integer and cannot be larger than sample.min")
+ ## random.sample -- fraction of random sample in the validation set
+ random.sample <- as.numeric(random.sample)[1]
+ if (random.sample < 0 || random.sample > 1)
+ stop("random.sample must be a number between 0 and 1")
+
+ #### Parameters for the detection of suspects
+ ## algorithm -- algorithm used to detect suspect particles
+ ## diff.max -- maximum tolerated difference in probabilities for class identification
+ diff.max <- as.numeric(diff.max)[1]
+ if (diff.max < 0)
+ stop("diff.max must be a positive number or zero")
+ ## prop.bio -- group probabilities, using biological information
+ if (length(prop.bio) && (!is.numeric(prop.bio) || is.null(names(prop.bio))))
+ stop("prop.bio must be a named vector (groups) of numbers")
+
+ ## zidb -- path to the zidbfile to manually validate
+ ## testdir -- path of the directory used for manual validation
+
+ ## Variables
+ proba <- NULL # identification probabilities (additional vars for suspect detection)
+ corr <- NULL # main information about the correction
+ validated <- NULL # validated class of the particles
+ validation.data <- NULL # data sent for validation at a given step
+ predicted <- NULL # predictions at the beginning
+ step <- -1 # current iteration
+ sample.size <- NULL # size of the next subsample to validate
+ ## TODO: eliminate this and get it from table(corr$Step)
+ validated.fracs <- 0 # fraction of items validated at each step
+
+ ## Manual validation variables
+ ntrusted <- NULL # number of trusted particles detected
+ nsuspect <- NULL # number of suspect particles detected
+ ntrusted.tovalid <- NULL # number of trusted particles to validate
+ nsuspect.tovalid <- NULL # number of suspect particles to validate
+ testset <- NULL # subsample to validate
+
+ ## Validation of particles TODO: get rid of these two flags!
+ step.manual <- FALSE # should the user validate particles?
+ testset.validated <- FALSE # is the testset validated?
+
+ ## Plankton sorter variables
+ sorter.title <- NULL # title of the plankton sorter page
+ sorter.id <- NULL # identifier of the plankton sorter page
+
+ ## Results
+ manual.history <- NULL # history of the manual confusion matrix
+ manual2.history <- NULL # history of manual + 2nd ident confusion matrix
+ corr.confusion <- NULL # corrected confusion matrix
+ correction.history <- NULL # history of the correction confusion matrix
+ error.estim.data <- NULL # data used to estimate the error
+ error.estim <- NULL # history of the error estimation
+ error.estim.history <- NULL # evolution of the error estimation
+ error.estim.rd <- NULL # error using validated, randomly selected items
+ error.estim.rd.history <- NULL # evolution of the error estimation
+ suspect.history <- list() # history of suspect detection
+ validated.history <- list() # history of validated particles
+ error.valid.history <- list() # history of error among validated items
+ error.suspect.history <- list() # history of error among suspect items
+ error.trusted.history <- list() # history of error among trusted items
+ nsuspect.history <- NULL # history of the number of suspect items
+ ntrusted.history <- NULL # history of the number of trusted items
+ nsuspect.tovalid.history <- NULL # history of suspect validation
+ ntrusted.tovalid.history <- NULL # history of trusted validation
+ errInFract.history <- data.frame() # evolution of the error in each fraction
+
+ ## Initialize the data for error correction (data, proba, corr and others)
+ initialize <- function () {
+ ## Check that this directory exists and is empty
+ if (mode != "stat") {
+ if (is.null(testdir))
+ testdir <<- file.path(tempdir(),
+ zooimage::noExtension(zidb))
+ if (file.exists(testdir)) {
+
+ res <- dlgMessage(paste("Temporary validation directory already",
+ "exists. Do you want to erase the old data there?"), type = "okcancel")$res
+ if (res == "cancel")
+ stop("testdir (", testdir, ") must not exist yet!")
+ unlink(testdir, recursive = TRUE)
+ }
+ dir.create(testdir, recursive = TRUE)
+ if (!file.exists(testdir))
+ stop("cannot create 'testdir'!")
+ ## Put required files there: create the planktonSorter directory
+ plSort <- file.path(testdir, "planktonSorter")
+ dir.create(plSort)
+ plFiles <- dir(system.file("planktonSorter", package = "zooimage"),
+ full.names = TRUE)
+ res <- file.copy(plFiles, file.path(plSort, basename(plFiles)))
+ if (any(!res))
+ stop("Problem when copying one or more plankton sorter files")
+ }
+
+ ## Indicate that we are preparing the validation environment
+ cat("Preparing a validation environment...\n")
+
+ ## Be sure that data is correctly ordered
+ data <<- data[order(data$Item), ]
+ ## Be sure 'Id' exists in data
+ data$Id <- makeId(data)
+
+ ## Make sure items' membership is predicted by the classifier
+ predicted <- data$Predicted
+ ## TODO: shouldn't we do this all the time???
+ if (is.null(predicted)) # Predict class if it wasn't done yet
+ predicted <- predict(classifier, data, class.only = TRUE,
+ type = "class")
+ predicted <<- predicted
+
+# if (mode != "validation") {
+ ## Store validated items and table of validated items
+ validated <<- data$Class
+ ## TODO: why this, and Class put in validated?
+ data$Class <- NULL
+ data <<- data
+# }
+ ## Keep all data
+### data <<- attr(classifier, "calc.vars")(data)
+### ## Do not keep incomplete attributes
+### data <<- data[, sapply(data, FUN = function(x) !any(is.na(x)))]
+ proba <<- classProba(data, predicted, classifier, diff.max = diff.max,
+ prop.bio = prop.bio)
+ nobs <- nrow(data)
+ error <- factor(rep("Error", nobs), levels = c("Correct", "Error"))
+ ## Compute the second possible identification
+ secondIdent <- function (groups) {
+ proba.table <- attr(proba, "IdentParam")$ProbaTable
+ proba.order <- apply(proba.table, 1, order, decreasing = TRUE)
+
+ fst.ids <- proba.order[1, ]
+ scd.ids <- proba.order[2, ]
+
+ proba.max <- apply(proba.table, 1, sort, decreasing = TRUE)[1, ]
+ max.ids <- proba.max == 1
+ scd.ids[max.ids] <- fst.ids[max.ids]
+
+ factor(groups[scd.ids], levels = groups)
+ }
+ predicted2 <- secondIdent(levels(predicted))
+
+ ## Creation of corr object
+ corr <<- data.frame(Actual = predicted, Actual2 = predicted2,
+ Predicted = predicted, Predicted2 = predicted2, Validated = FALSE,
+ Error = error, Step = step, Suspect = rep(TRUE, nobs),
+ RdValidated = rep(Inf, nobs), OtherGp = rep(FALSE, nobs))
+
+ ## Confusion matrix of original classifier
+ train.validated <- attr(classifier, "response")
+ train.predicted <- attr(classifier, "cvpredict")
+ train.conf <- confusion(x = train.predicted, y = train.validated)
+
+ ## First error correction: we start with Solow et al. estimation
+ ## Compute correction of abundances using a method similar to Solow et al.,
+ ## but that is not stuck with singular matrices problems
+ ## TODO: this takes a long time => try Solow et al first before using this!
+ correctionLab <- function (x, conf) {
+ l <- length(x)
+ if (l != ncol(conf))
+ stop("'x' has not the same length than 'conf'")
+
+ toMat <- function(x, byrow = TRUE)
+ matrix(rep(x, l), ncol = l, byrow = byrow)
+
+ predMat <- toMat(colSums(conf))
+ xMat <- toMat(x)
+
+ ## Remove 0
+ notZero <- function (x) {
+ if (0 %in% x) x[x == 0] <- 1
+ x
+ }
+
+ confB <- conf / notZero(predMat) * xMat
+ x2 <- rowSums(confB)
+ classMat <- toMat(rowSums(conf), byrow = FALSE)
+ while (!isTRUE(all.equal(x, x2, tolerance = 0.00001))) {
+ x <- x2
+ confA <- conf / notZero(classMat) * toMat(x2, byrow = FALSE)
+ xMat2 <- toMat(colSums(confA))
+ confB <- confA / notZero(xMat2) * xMat
+ x2 <- rowSums(confB)
+ }
+ round(x2)
+ }
+ tablePredicted <- table(predicted)
+ correction.history <<- correctionLab(x = tablePredicted,
+ conf = train.conf)
+ manual.history <<- tablePredicted
+ manual2.history <<- manual.history
+
+ ## Increment step (should be 0 now, because initial value is -1)
+ step <<- step + 1
+ ## Determine the number of vignettes to manually validate
+ setSampleSize()
+ }
+
+ ## Compute the size of the next subsample: update sample.size
+ setSampleSize <- function () {
+ sample.size <<- round(min(nrow(corr[!corr$Validated, ]), # How many remain?
+ ## Or number of items we want to take
+ max(nrow(data) * fraction, # Items to take
+ sample.min, # Minimum items we can take
+ ## TODO: check the following one!
+ grp.min * length(table(predicted))))) # Minimum from groups
+ }
+
+ ## Determine the subsample to validate
+ ## Update Step and RdValidated (used for error estimation)
+ ## Automatically place vignettes in directories for manual validation
+ prepareTest <- function () {
+ nobs <- nrow(data)
+ if (step < 1) {
+ ## The first time, take a random subsample
+ ## Same as considering everything as suspect
+ sample.ids <- sample(1:nobs, size = sample.size)
+ corr$Step[sample.ids] <<- step
+ corr$RdValidated[sample.ids] <<- step
+ nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, sample.size)
+ ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, 0)
+ nsuspect.history <<- c(nsuspect.history, nobs)
+ ntrusted.history <<- c(ntrusted.history, 0)
+ } else { # Step > 0
+ ## Mix trusted and suspect particles
+ #validated <- corr[corr$Validated, ]
+ notvalidated <- corr[!corr$Validated, ]
+ nsuspect <<- sum(notvalidated$Suspect) #nrow(notvalidated[notvalidated$Suspect, ])
+ ntrusted <<- nrow(notvalidated) - nsuspect #nrow(notvalidated[!notvalidated$Suspect, ])
+ ## Determine the number of random items used in error estimation
+ numRandom <- max(sample.size - nsuspect,
+ round(random.sample * sample.size))
+ ids <- as.character(1:nobs)
+ ## All items not included in RdValidated
+ tosample.ids <- ids[is.infinite(corr$RdValidated)]
+ ## Items to validate
+ randomsample.ids <- as.numeric(sample(tosample.ids,
+ size = min(numRandom, length(tosample.ids))))
+ ## Used to estimate error at step i
+ corr$RdValidated[randomsample.ids] <<- step
+ newstep <- corr$Step[randomsample.ids]
+ ## Except those already validated and used for error estimation
+ newstep[newstep == -1] <- step
+ corr$Step[randomsample.ids] <<- newstep
+ notvalid.ids <- ids[!corr$Validated & corr$RdValidated == step]
+ ## Number of items to validate in order to achieve sample.size
+ numSample <- sample.size - length(notvalid.ids)
+ if (numSample > 0) {
+ ## Randomly select suspect items not validated
+ suspnotval.ids <- ids[!corr$Validated & corr$Suspect &
+ is.infinite(corr$RdValidated) & corr$Step == -1]
+ if (length(suspnotval.ids)) {
+ suspsample.ids <- as.numeric(sample(suspnotval.ids,
+ size = min(numSample, length(suspnotval.ids))))
+ corr$Step[suspsample.ids] <<- step
+ numSample <- numSample - length(suspsample.ids)
+ }
+ if (numSample > 0) {
+ ## Randomly select trusted items not validated
+ trustnotval.ids <- ids[!corr$Validated & !corr$Suspect &
+ is.infinite(corr$RdValidated) & corr$Step == -1]
+ if (length(trustnotval.ids)) {
+ trustsample.ids <- as.numeric(sample(trustnotval.ids,
+ size = min(numSample, length(trustnotval.ids))))
+ corr$Step[trustsample.ids] <<- step
+ }
+ }
+ }
+ nsuspect.tovalid <- length(ids[corr$Step == step & corr$Suspect])
+ ntrusted.tovalid <- length(ids[corr$Step == step & !corr$Suspect])
+ nsuspect.history <<- c(nsuspect.history, nsuspect)
+ ntrusted.history <<- c(ntrusted.history, ntrusted)
+ nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, nsuspect.tovalid)
+ ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, ntrusted.tovalid)
+ }
+
+ if (mode != "stat") {
+ ## Make sure the R httpd server is started
+ tools <- getNamespace("tools")
+ port <- tools$httpdPort
+ if (port == 0) port <- startDynamicHelp(TRUE)
+ if (port == 0) stop("Cannot start the R httpd server")
+
+ subdir <- paste0("step", step + 1) # Because step starts at 0,
+ ## but this is really the beginning of the next step now
+ stepdir <- file.path(testdir, subdir)
+ dir.create(stepdir)
+ Zidb <- zidbLink(zidb)
+ ## Write data in test directory
+ keepRows <- corr$Step == step
+ Vigns <- data[keepRows, "Id"]
+ imgext <- Zidb[[".ImageType"]]
+ ## Extract all these images to stepdir
+ vigpath <- file.path(stepdir, paste(Vigns, imgext, sep = "."))
+ names(vigpath) <- Vigns
+ if (length(Vigns))
+ for (j in 1:length(Vigns))
+ writeBin(Zidb[[Vigns[j]]], vigpath[j])
+
+ ## Create all directories of groups
+ path <- attr(classifier, "path")
+ names(path) <- basename(path)
+ ## Create planktonSorter.html file
+ vigfiles <- basename(vigpath)
+ pred <- as.character(corr[keepRows, "Predicted"])
+ names(vigfiles) <- pred
+ Sample <- as.character(sub("\\+.+_[0-9]+", "", Vigns[1]))
+
+ if (step > 0) {
+ ## Recreate the previous page with sorted items
+ oldPage <- file.path(testdir, paste0("step", step),
+ "planktonSorter.html")
+ oldItems <- corr$Step == (step - 1)
+ oldVigns <- paste(data[oldItems, "Id"], imgext, sep = ".")
+ oldNames <- as.character(corr[oldItems, "Actual"])
+ oldNames[is.na(oldNames)] <- "[other]"
+ names(oldVigns) <- oldNames
+ res <- planktonSorterPage(groups = path, vigns = oldVigns,
+ title = sorter.title, id = sorter.id,
+ step = step, file = oldPage)
+ }
+
+ ## Create new page
+ plSorterPage <- file.path(stepdir, "planktonSorter.html")
+ sorter.title <<- paste0(Sample, "/Step", step + 1)
+ sorter.id <<- paste(id, step + 1, sep = "/")
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/zooimage -r 245