[Zooimage-commits] r229 - in pkg/mlearning: . R man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Aug 5 01:42:46 CEST 2012


Author: phgrosjean
Date: 2012-08-05 01:42:45 +0200 (Sun, 05 Aug 2012)
New Revision: 229

Modified:
   pkg/mlearning/DESCRIPTION
   pkg/mlearning/NAMESPACE
   pkg/mlearning/R/confusion.R
   pkg/mlearning/R/mlearning.R
   pkg/mlearning/man/confusion.Rd
   pkg/mlearning/man/mlearning.Rd
   pkg/mlearning/man/mlearning.package.Rd
Log:
Further changes, mainly to confusion plots

Modified: pkg/mlearning/DESCRIPTION
===================================================================
--- pkg/mlearning/DESCRIPTION	2012-08-02 19:23:08 UTC (rev 228)
+++ pkg/mlearning/DESCRIPTION	2012-08-04 23:42:45 UTC (rev 229)
@@ -1,18 +1,14 @@
 Package: mlearning
 Type: Package
-Title: Machine learning algorithms with unified formula interface and confusion matrices
+Title: Machine learning algorithms with unified interface and confusion matrices
 Version: 1.0-0
-Date: 2012-07-18
+Date: 2012-08-04
 Author: Ph. Grosjean & K. Denis
 Maintainer: Ph. Grosjean <Philippe.Grosjean at umons.ac.be>
 Depends: R (>= 2.14.0)
-Imports: dataframe, stats, nnet, class, MASS, e1071, randomForest, RWeka, ipred, grDevices 
-Suggests: rJava, RWekajars, mlbench, datasets, RColorBrewer
-Description: This package provides alternate interface to various machine
-    learning algorithms in order to offer a unified, formula-based, interface.
-    However, given the caveats of the formula interface in R (leading to
-    multiple copies of the data), if a simplified formula is provided (like
-    Class ~ .), a more performant code is used, if possible. Confusion matrices
-    can be calculated and viewed as tables or plots, too.
+Imports: stats, grDevices, class, nnet, MASS, e1071, randomForest, ipred 
+Suggests: mlbench, datasets, RColorBrewer
+Description: This package provides a unified interface to various machine
+    learning algorithms. Confusion matrices are provided too.
 License: GPL (>= 2)
 URL: http://www.sciviews.org/zooimage
\ No newline at end of file

Modified: pkg/mlearning/NAMESPACE
===================================================================
--- pkg/mlearning/NAMESPACE	2012-08-02 19:23:08 UTC (rev 228)
+++ pkg/mlearning/NAMESPACE	2012-08-04 23:42:45 UTC (rev 229)
@@ -1,18 +1,15 @@
-import(dataframe)
+#import(dataframe)
 import(stats)
+import(grDevices)
 import(class)
 import(nnet)
 import(MASS)
 import(e1071)
 import(randomForest)
-import(RWeka)
 import(ipred)
-#import(RColorBrewer)
-#importFrom(gplots, heatmap.2)
-import(grDevices)
+#import(klaR)
+#import(RWeka)
 
-#importFrom(gdata, combine)
-
 export(mlearning)
 export(mlLda)
 export(mlQda)
@@ -20,7 +17,7 @@
 export(mlLvq)
 export(mlNnet)
 export(mlNaiveBayes)
-export(mlNaiveBayesWeka)
+#export(mlNaiveBayesWeka)
 
 export(cvpredict)
 export(summary.lvq)
@@ -28,7 +25,13 @@
 export(response)
 export(train)
 
+export("weights<-")
+
 export(confusion)
+export(confusionImage)
+export(confusionBarplot)
+export(confusionStars)
+export(confusionDendrogram)
 
 S3method(confusion, default)
 S3method(confusion, mlearning)
@@ -47,6 +50,9 @@
 S3method(response, default)
 S3method(train, default)
 
+S3method(weights, confusion)
+S3method("weights<-", confusion)
+
 S3method(summary, lvq)
 S3method(print, summary.lvq)
 
@@ -56,7 +62,7 @@
 S3method(mlLvq, default)
 S3method(mlNnet, default)
 S3method(mlNaiveBayes, default)
-S3method(mlNaiveBayesWeka, default)
+#S3method(mlNaiveBayesWeka, default)
 
 S3method(mlLda, formula)
 S3method(mlQda, formula)
@@ -64,7 +70,7 @@
 S3method(mlLvq, formula)
 S3method(mlNnet, formula)
 S3method(mlNaiveBayes, formula)
-S3method(mlNaiveBayesWeka, formula)
+#S3method(mlNaiveBayesWeka, formula)
 
 S3method(predict, mlLda)
 S3method(predict, mlQda)

Modified: pkg/mlearning/R/confusion.R
===================================================================
--- pkg/mlearning/R/confusion.R	2012-08-02 19:23:08 UTC (rev 228)
+++ pkg/mlearning/R/confusion.R	2012-08-04 23:42:45 UTC (rev 229)
@@ -1,182 +1,302 @@
-## TODO: add the possibility to droplevels() in confusion object!... or in print()/plot()?
 confusion <- function (x, ...)
 	UseMethod("confusion")
 
-.confusion <- function (classes, labels, ...)
+.confusion <- function (classes, labels, weights, ...)
 {
-	if (!length(labels)) {
-		labels <- c("Predicted", "Actual")
-	} else {
-		labels <- as.character(labels)
-		if (length(labels) != 2)
-			stop("You must provide exactly 2 character strings for 'labels'")
-	}
-	## Make sure both variables are correctly named
-	names(classes) <- labels
-	## How many objects by level?
-	NbrPerClass1 <- table(classes[, 1])
-	## How many predicted objects
-	NbrPerClass2 <- table(classes[, 2])
-	## Confusion matrix
-	Conf <- table(classes)
-	## Further stats: total, true positives, accuracy
-	Total <- sum(Conf)
-	TruePos <- sum(diag(Conf))
-	Stats <- c(total = Total, truepos = TruePos,
-		accuracy = TruePos / Total * 100)
-
-	## Change labels to get a more compact presentation
-	colnames(Conf) <- formatC(1:ncol(Conf), digits = 1, flag = "0")
-	rownames(Conf) <- paste(colnames(Conf), rownames(Conf))
-
+	res <- table(classes, dnn = labels)
+	total <- sum(res)
+	truePos <- sum(diag(res))
+	row.freqs <- rowSums(res)	
+	
 	## Additional data as attributes
-	attr(Conf, "stats") <- Stats
-	attr(Conf, "nbr.rows") <- NbrPerClass1
-	attr(Conf, "nbr.cols") <- NbrPerClass2
+	attr(res, "row.freqs") <- row.freqs
+	attr(res, "col.freqs") <- colSums(res)
+	attr(res, "levels") <- levels(classes[1, ]) # These are *initial* levels!
+	## Final levels may differ if there are empty levels, or NAs!
+	attr(res, "weights") <- row.freqs # Initial weights are row.freqs
+	attr(res, "stats") <- c(total = total, truepos = truePos,
+		error = 1 - (truePos / total))
 	
-	## This is a confusion object
-	class(Conf) <- c("confusion", "table")
-	Conf
+	## This is a confusion object, inheriting from table
+	class(res) <- c("confusion", "table")
+	
+	## Do we reweight the confusion matrix?
+	if (!missing(weights)) weights(res) <- weights
+	
+	res
 }
 	
 confusion.default <- function (x, y = NULL, vars = c("Actual", "Predicted"),
-labels = vars, merge.by = "Id", ...)
+labels = vars, merge.by = "Id", weights, ...)
 {	
 	## If the object is already a 'confusion' object, return it
 	if (inherits(x, "confusion")) {
 		if (!missing(y))
-			warning("you cannot provide 'y' when 'x' is already a 'confusion' object")
+			warning("you cannot provide 'y' when 'x' is a 'confusion' object")
+		## Possibly reweight it
+		if (!missing(weights)) weights(x) <- weights		
 		return(x)
 	}
 	
 	## Idem if there is a 'confusion' attribute and no y
 	conf <- attr(x, "confusion")
-	if (!is.null(conf) && missing(y)) return(conf)
-	
-	## Check/convert vars and labels
-	if (!length(vars)) {
-		vars <- c("Class", "Ident")
-	} else {
-		vars <- as.character(vars)
-		if (length(vars) != 2)
-			stop("You must provide exactly 2 strings for 'vars'")
-	}
+	if (!is.null(conf) && missing(y)) {
+		## Possibly reweight it
+		if (!missing(weights)) weights(conf) <- weights
+		return(conf)
+	}	
+
+	## Reworks and check arguments
+	vars <- as.character(vars)
+	if (length(vars) != 2)
+		stop("You must provide exactly 2 strings for 'vars'")
 	merge.by <- as.character(merge.by)
 	
 	## There are three possibilities:
-	## 1) a single data frame => use vars
+	## 1) A single data frame => use vars
 	if (missing(y)) {
 		## Special case of a data frame or list of two factors: keep as it is
 		if (is.list(x) && length(x) == 2 && is.null(vars)) {
 			clCompa <- as.data.frame(x)
-			labels <- names(clCompa)
+			if (missing(labels)) labels <- names(clCompa)
 		} else {
 			x <- as.data.frame(x)
-			## Check that vars exist and levels of two vars do match
+			## Check that vars exist
 			if (is.null(names(x)) || !all(vars %in% names(x)))
 				stop("'vars' are not among column names of 'x'")
-			if (!all(sort(levels(x[[vars[1]]])) == sort(levels(x[[vars[2]]]))))
-				stop("the levels of the two variables in 'x' do not match")
+			## Check that levels of two vars do match
+			lev1 <- levels(x[[vars[1]]])
+			lev2 <- levels(x[[vars[2]]])
+			if (!all(lev1 == lev2)) {
+				## If difference is only in the order of both levels, reorder #2
+				if (!all(sort(lev1) == sort(lev2))) {
+					stop("levels of the two variables in 'x' do not match")
+				} else x[[vars[2]]] <- factor(as.character(x[[vars[2]]]),
+					levels = lev1)
+			}
 			clCompa <- data.frame(class1 = x[[vars[1]]], class2 = x[[vars[2]]])
 		}
 	} else { # y is provided
-		## 2) two vectors of factors to compare (must have same length/same levels)
+		## 2) Two vectors of factors (must have same length/same levels)
 		if (is.factor(x) && is.factor(y)) {
+			## Check length match
 			if (length(x) != length(x))
-				stop("not the same number of items in 'x' and 'y'")
-			if (!all(sort(levels(x))  == sort(levels(y))))
-				stop("'x' and 'y' levels do not match")
+				stop("lengths of 'x' and 'y' are not the same")
+			## Check levels match
+			lev1 <- levels(x)
+			lev2 <- levels(y)
+			if (!all(lev1  == lev2)) {
+				## If difference is only in the order of both levels, reorder #2
+				if (!all(sort(lev1)  == sort(lev1))) {
+					stop("'x' and 'y' levels do not match")
+				} else y <- factor(as.character(y), levels = lev1)
+			}
 			clCompa <- data.frame(class1 = y, class2 = x)
 		} else {
-			## 3) two data frames => merge first, then use vars
-			## Check levels match
-			## Note: if one is a subset of the other,
-			## would it be possible to match them???
+			## 3) Two data frames => merge first, then use vars
+			## Check vars exist
 			if (is.null(names(x)) || !(vars[1] %in% names(x)))
 				stop("first item of 'vars' is not among names of 'x'")
 			if (is.null(names(y)) || !(vars[2] %in% names(y)))
 				stop("second item of 'vars' is not among names of 'y'")
-			if (!all(sort(levels(x[[vars[1]]]))  == sort(levels(y[[vars[2]]]))))
-				stop("levels of the  two variables in 'x' and 'y' do not match")
+			## Check that levels of two vars do match
+			lev1 <- levels(x[[vars[1]]])
+			lev2 <- levels(y[[vars[2]]])
+			if (!all(lev1  == lev2)) {
+				## If difference is only in the order of both levels, reorder #2
+				if (!all(sort(lev1)  == sort(lev2))) {
+					stop("levels of the variables in 'x' and 'y' do not match")
+				} else x[[vars[2]]] <- factor(as.character(x[[vars[2]]]),
+					levels = lev1)
+			}
 			## Merge data according to merge.by
-			clCompa <- merge(y[, c(vars[2], merge.by)], x[, c(vars[1], merge.by)],
-				by = merge.by)
-			clCompa <- clCompa[, c(ncol(clCompa) - 1, ncol(clCompa))]
+			clCompa <- merge(y[, c(vars[2], merge.by)],
+				x[, c(vars[1], merge.by)], by = merge.by)
+			nc <- ncol(clCompa)
+			clCompa <- clCompa[, c(nc - 1, nc)]
 			## Are there common objects left?
-			if (nrow(clCompa) == 0)
-				stop("no common objects between 'x' and 'y'")
+			if (!nrow(clCompa)) stop("no common objects between 'x' and 'y'")
 		}
 	}
 	
-	.confusion(clCompa, labels, ...)
+	## Construct the confusion object
+	if (missing(weights)) {
+		.confusion(classes = clCompa, labels = labels, ...)
+	} else {
+		.confusion(classes = clCompa, labels = labels, weights = weights, ...)
+	}
 }
 
 confusion.mlearning <- function (x, y = response(x),
-labels = c("Actual", "Predicted"), ...)
-	.confusion(data.frame(class1 = y, class2 = predict(x, ...)),
-		labels = labels, ...)
+labels = c("Actual", "Predicted"), weights, ...) {
+	## Check labels
+	labels <- as.character(labels)
+	if (length(labels) != 2)
+		stop("You must provide exactly 2 character strings for 'labels'")
+	
+	## Extract class2 by using predict on the mlearning object
+	class2 <- predict(x, ...)
+	
+	## Check that both variables are of same length and same levels
+	if (length(y) != length(class2))
+		stop("lengths of 'x' and 'y' are not the same")
+	lev1 <- levels(y)
+	lev2 <- levels(class2)
+	if (!all(lev1  == lev2)) {
+		## If difference is only in the order of both levels, reorder #2
+		if (!all(sort(lev1)  == sort(lev2))) {
+			stop("levels of 'x' and 'y' do not match")
+		} else class2 <- factor(as.character(class2), levels = lev1)
+	}
+	
+	## Construct the confusion object
+	if (missing(weights)) {
+		.confusion(data.frame(class1 = y, class2 = class2),
+			labels = labels, ...)
+	} else {
+		.confusion(data.frame(class1 = y, class2 = class2),
+			labels = labels, weights = weights, ...)
+	}
+}
 
-print.confusion <- function (x, error.col = TRUE, ...)
+weights.confusion <- function (object, ...)
+	attr(object, "weights")
+
+`weights<-`<- function (object, ..., value)
+	UseMethod("weights<-")
+
+`weights<-.confusion`<- function (object, ..., value)
 {
+	if (!length(value)) { # value is NULL or of zero length
+		## Reset weights to original frequencies
+		value <- attr(object, "row.freqs")
+		attr(object, "weights") <- value
+		round(object / apply(object, 1, sum) * value)
+	
+	} else if (is.numeric(value)) { # value is numeric
+		
+		if (length(value) == 1) { # value is a single number
+			if (is.na(value) || !is.finite(value) || value <= 0)
+				stop("value must be a finite positive number")
+			res <- object / apply(object, 1, sum) * as.numeric(value)
+		
+		} else { # value is a vector of numerics
+			## It must be either of the same length as nrow(object) or of
+			## levels(objects)
+			l <- length(value)
+			n <- names(value)
+			l2 <- levels(object)
+			
+			if (l == nrow(object)) {
+				## If the vector is named, check names and possibly reorder it
+				if (length(n))
+					if (all(n %in% rownames(object))) {
+						value <- value[rownames(object)]
+					} else stop("Names of the values do not match levels in the confusion matrix")
+			
+			} else if (l == length(l2)) {
+				## Assume names as levels(object), if they are not provides
+				if (!length(n)) names(value) <- n <- l2
+				
+				## If the vector is named, check names match levels
+				if (length(n))
+					if (all(n %in% l2)) {
+						## Extract levels used in the confusion matrix
+						value <- value[rownames(object)]
+					} else stop("Names of the values do not match levels in the confusion matrix")
+
+			} else stop("length of 'value' do not match the number of levels in the confusion matrix")	
+			
+			res <- object / apply(object, 1, sum) * as.numeric(value)
+		}
+		attr(res, "weights") <- rowSums(res)
+		res
+		
+	} else stop("value must be a numeric vector, a single number or NULL")
+}
+
+print.confusion <- function (x, sums = TRUE, error.col = sums, digits = 0,
+sort = "ward", ...)
+{
 	## General stats on the confusion matrix
 	Stats <- attr(x, "stats")
+	Error <- round(Stats["error"] * 100, 1)
 	cat(Stats["total"], " items classified with ", Stats["truepos"],
-		" true positives (", round(Stats["accuracy"], 1), "% accuracy)\n",
+		" true positives (error rate = ", Error, "%)\n",
 		sep = "")
+	row.freqs <- attr(x, "row.freqs")
+	if (!all(attr(x, "weights") == row.freqs)) {
+		cat("with initial row weights (frequencies):\n")
+		print(row.freqs)
+		cat("Reweighted to:\n")
+	}
 	
 	## Print the confusion matrix itself
 	X <- x
 	class(X) <- "table"
+
+    n <- ncol(X)
+
+	## Do we sort items?
+	if (length(sort) && !is.na(sort) && sort != FALSE && sort != "") {
+		## Grouping of items
+		confuSim <- X + t(X)
+		confuSim <- 1 - (confuSim / sum(confuSim) * 2)
+		confuDist <- structure(confuSim[lower.tri(confuSim)], Size = n,
+			Diag = FALSE, Upper = FALSE, method = "confusion", call = "",
+			class = "dist")
+		order <- hclust(confuDist, method = sort)$order
+		X <- X[order, order]
+	}
+	
+	## Change row and column names to a more compact representation
+	nbrs <- formatC(1:ncol(X), digits = 1, flag = "0")
+	colnames(X) <- nbrs
+	rownames(X) <- paste(nbrs, rownames(X))
+	
+	## Add sums?
+	if (isTRUE(as.logical(sums))) {
+		## Calculate error (%)
+		ErrorTot <- (1 - (sum(diag(x)) / sum(x))) * 100
+		Errors <- as.integer(round(c((1 - diag(X) / apply(X, 1, sum)) * 100,
+			ErrorTot), 0))
+		## ... and add row and column sums
+		X <- addmargins(X, FUN = list(`(sum)` = sum), quiet = TRUE)
+	} else Errors <- as.integer(round((1 - diag(X) / apply(X, 1, sum)) * 100, 0))
+	
+	## Add class errors?
 	if (isTRUE(as.logical(error.col))) {
-		print(cbind(X, `Error (FNR)` = round((1 - diag(X) / apply(X, 1, sum)), 3)))
-	} else print(X)
+		X <- as.table(cbind(X, `(FNR%)` = Errors))
+		dn <- dimnames(X)
+		names(dn) <- names(dimnames(x))
+		dimnames(X) <- dn
+	}
+	print(round(X, digits))
 	
 	## Return the original object invisibly
 	invisible(x)
 }
 
+## TODO: a precision-recall diagram for all groups with F1-score lines
 plot.confusion <- function (x, y = NULL,
 type = c("image", "barplot", "stars", "dendrogram"), stat1 = "Recall",
-stat2 = "Precision", ...)
+stat2 = "Precision", names, ...)
 {
 	if (is.null(y)) type <- match.arg(type)[1] else type <- "stars"
+	if (missing(names)) names <- c(substitute(x), substitute(y))
 	res <- switch(type,
-		image = .confusionImage(x, y, ...),
-		barplot = .confusionBar(x, y, ...),
-		stars = .confusionStars(x, y, stat1 = stat1, stat2 = stat2, ...),
-		dendrogram = .confusionDendro(x, y, ...),
+		image = confusionImage(x, y, ...),
+		barplot = confusionBarplot(x, y, ...),
+		stars = confusionStars(x, y, stat1 = stat1, stat2 = stat2, names, ...),
+		dendrogram = confusionDendrogram(x, y, ...),
 		stop("'type' must be 'image', 'barplot', 'stars' or 'dendrogram'"))
 	invisible(res)
 }
 
-## These functions do the respective graphs for confusion objects
-## Old (simpler) version
-#.confusionImage <- function (x, y = NULL, col = heat.colors(10),
-#mar = c(5.1, 12.1, 4.1, 2.1), ...)
-#{
-#	if (!inherits(x, "confusion"))
-#		stop("'x' must be a 'confusion' object")
-
-#	if (!is.null(y))
-#		stop("cannot use a second classifier 'y' for this plot")
-#	omar  <- par("mar")
-#	on.exit(par(omar))
-#    par(mar = mar)
-#	n <- ncol(x)
-#	image(1:n, 1:n, 1 / (t(x[n:1, 1:n])), col = col, xlab = "", ylab = "",
-#		xaxt = "n", yaxt = "n", ...)
-#    axis(1, at = 1:n, las = 2)
-#    axis(2, at = n:1, labels = paste(names(attr(x, "nbr.cols")), 1:n),
-#		las = 1)
-#    abline(h = (1:(n + 1)) - 0.5, lty = 2, col = "gray")
-#    abline(v = (1:(n + 1)) - 0.5, lty = 2, col = "gray")
-#	invisible(x)
-#}
-
 ## Representation of the confusion matrix
-.confusionImage <- function (x, y = NULL, mar = c(3.1, 10.1, 3.1, 3.1), asp = 1, 
-label = "Actual \\ Predicted", sort = "complete", cex = 1, colfun = NULL,
-ncols = 41, col0 = FALSE, grid.col = "gray", ...)
+confusionImage <- function (x, y = NULL, labels = names(dimnames(x)),
+sort = "ward", numbers = TRUE, digits = 0, mar = c(3.1, 10.1, 3.1, 3.1),
+cex = 1, asp = 1, colfun, ncols = 41, col0 = FALSE, grid.col = "gray", ...)
 {
 	if (!inherits(x, "confusion"))
         stop("'x' must be a 'confusion' object")
@@ -184,100 +304,95 @@
 	if (!is.null(y))
 		stop("cannot use a second classifier 'y' for this plot")
 	
+	## Default labels in case none provided
+	if (is.null(labels)) labels <- c("Actual", "Predicted")
+	
 	## Default color function
-	if (!length(colfun)) colfun <- function (n, alpha = 1, s = 0.9, v = 0.9) {
+	## (greens for correct values, reds for errors, white for zero)
+	if (missing(colfun)) colfun <- function (n, alpha = 1, s = 0.9, v = 0.9) {
 		if ((n <- as.integer(n[1L])) <= 0) return(character(0L))
-		## Define the initial (red) and final (blue) colors with white in between
+		## Initial (red) and final (green) colors with white in between
 		cols <- c(hsv(h = 0, s = s, v = v, alpha = alpha),   # Red
 				  hsv(h = 0, s = 0, v = v, alpha = alpha),   # White
-				  hsv(h = 2/3, s = s, v = v, alpha = alpha)) # Blue
-		## Use a color ramp from red to white to blue
+				  hsv(h = 2/6, s = s, v = v, alpha = alpha)) # Green
+		## Use a color ramp from red to white to green
 		return(colorRampPalette(cols)(n))
 	}
 	
-    manuLev <- sub("...", "", rownames(x))
-    autoLev <- manuLev
     n <- ncol(x)
 
 	## Do we sort items?
 	if (length(sort) && !is.na(sort) && sort != FALSE && sort != "") {
 		## Grouping of items
 		confuSim <- x + t(x)
-		confuSim <- max(confuSim) - confuSim
+		confuSim <- 1 - (confuSim / sum(confuSim) * 2)
 		confuDist <- structure(confuSim[lower.tri(confuSim)], Size = n,
 			Diag = FALSE, Upper = FALSE, method = "confusion", call = "",
 			class = "dist")
 		order <- hclust(confuDist, method = sort)$order
 		x <- x[order, order]
-		autoLev <- autoLev[order]
-		manuLev <- manuLev[order]
 	}
-	## Recode levels so that a number is used in front of manu labels
-	## and shown in auto
-	autoLev <- formatC(1:length(autoLev), width = 2, flag = "0")
-	manuLev <- paste(manuLev, autoLev, sep = "-")
-	row.names(x) <- manuLev
-	colnames(x) <- autoLev
-	## Calculate colors (use a transfo to get 0, 1, 2, 3, 4, 7, 10, 15, 25+)
+	
+	## Recode row and column names for more compact display
+	colnames(x) <- names2 <- formatC(1:n, digits = 1, flag = "0")
+	rownames(x) <- names1 <- paste(names2, rownames(x))
+	
+	## Transform for better colorization
+	## (use a transfo to get 0, 1, 2, 3, 4, 7, 10, 15, 25+)
 	confuCol <- x
 	confuCol <- log(confuCol + .5) * 2.33
 	confuCol[confuCol < 0] <- if (isTRUE(as.logical(col0))) 0 else NA
 	confuCol[confuCol > 10] <- 10
-	## Negative values (in blue) on the diagonal (correct IDs)
+	
+	## Negative values (in green) on the diagonal (correct IDs)
 	diag(confuCol) <- -diag(confuCol)	
+	
 	## Make an image of this matrix
 	opar <- par(no.readonly = TRUE)
 	on.exit(par(opar))
 	par(mar = mar, cex = cex)
 	image(1:n, 1:n, -t(confuCol[nrow(confuCol):1, ]), zlim = c(-10, 10),
 		asp = asp, bty = "n", col = colfun(ncols), xaxt = "n", yaxt = "n",
-		xlab = "", ylab = "", main = "", ...)
-	## Print the actual numbers
-	confuTxt <- as.character(x[n:1, ])
-	confuTxt[confuTxt == "0"] <- ""
-	text(rep(1:n, each = n), 1:n, labels = confuTxt)
-	## The grid
-	abline(h = 0:n + 0.5, col = grid.col)
-	abline(v = 0:n + 0.5, col = grid.col)
-	## The axis labels
-	axis(1, 1:n, labels = autoLev, tick =  FALSE, padj = 0)
-	axis(2, 1:n, labels = manuLev[n:1], tick =  FALSE, las = 1, hadj = 1)
-	axis(3, 1:n, labels = autoLev, tick =  FALSE) #, cex.lab = cex)
-	axis(4, 1:n, labels = autoLev[n:1], tick =  FALSE, las = 1, hadj = 0)
-	## Legend at top-left
-	mar[2] <- 1.1
-	par (mar = mar, new = TRUE)
-	plot(0, 0, type = "n", xaxt = "n", yaxt = "n", bty = "n")
-	mtext(label, adj = 0, line = 1, cex = cex)
+		xlab = "", ylab = "", ...)
+	
+	## Indicate the actual numbers
+	if (isTRUE(as.logical(numbers))) {
+		confuTxt <- as.character(round(x[n:1, ], digits = digits))
+		confuTxt[confuTxt == "0"] <- ""
+		text(rep(1:n, each = n), 1:n, labels = confuTxt)
+	}
+	
+	## Add the grid
+	if (length(grid.col)) {
+		abline(h = 0:n + 0.5, col = grid.col)
+		abline(v = 0:n + 0.5, col = grid.col)
+	}
+	
+	## Add the axis labels
+	axis(1, 1:n, labels = names2, tick =  FALSE, padj = 0)
+	axis(2, 1:n, labels = names1[n:1], tick =  FALSE, las = 1, hadj = 1)
+	axis(3, 1:n, labels = names2, tick =  FALSE)
+	axis(4, 1:n, labels = names2[n:1], tick =  FALSE, las = 1, hadj = 0)
+	
+	## Add labels at top-left
+	if (length(labels)) {
+		if (length(labels) != 2) stop("You must provide two labels")
+		mar[2] <- 1.1
+		par (mar = mar, new = TRUE)
+		plot(0, 0, type = "n", xaxt = "n", yaxt = "n", bty = "n")
+		mtext(paste(labels, collapse = " // "), adj = 0, line = 1, cex = cex)
+	}
+	
 	## Return the confusion matrix, as displayed, in text format
 	invisible(x)
 }
 
-## Eliminated to avoid dependency on RColorBrewer and gplots!
-#.confusionTree <- function (x, y = NULL, maxval = 10, margins = c(2, 10),
-#row.v = TRUE, col.v = TRUE, ...)
-#{
-#	if (!inherits(x, "confusion"))
-#		stop("'x' must be a 'confusion' object")
-#	if (!is.null(y))
-#		stop("cannot use a second classifier 'y' for this plot")
-#
-#	nX <- nrow(x)
-#	nY <- ncol(x)
-#	nZ <- nX * nY
-#	confmat <- pmin(x, maxval)
-#	mypalette <- brewer.pal(maxval - 1, "Spectral")
-#	heatmap.2(x, col= c(0, mypalette), symm = TRUE, margins = margins,
-#		trace = "both", Rowv = row.v, Colv = col.v, cexRow = 0.2 + 1 / log10(nX),
-#		cexCol = 0.2 + 1 / log10(nY), tracecol = "Black", linecol = FALSE, ...)
-#}
-
-# Confusion bar with recall and precision in green bar and not outside as before
-# function modified for publication hclust 
-.confusionBar <- function (x, y = NULL,
+## Confusion barplot with recall and precision in green bars
+## TODO: various bar rescaling possibilities!!!
+confusionBarplot <- function (x, y = NULL,
 col = c("PeachPuff2", "green3", "lemonChiffon2"), mar = c(1.1, 8.1, 4.1, 2.1),
-cex = 0.7, cex.axis = cex, cex.legend = cex, main = "Precision versus Recall",
-min.width = 17, ...)
+cex = 1, cex.axis = cex, cex.legend = cex,
+main = "F-score (precision versus recall)", numbers = TRUE, min.width = 17, ...)
 {
     if (!inherits(x, "confusion"))
         stop("'x' must be a 'confusion' object")
@@ -285,143 +400,89 @@
 	if (!is.null(y))
 		stop("cannot use a second classifier 'y' for this plot")
 	
-    TP <- diag(x)
-    fn <- rowSums(x) - TP
-    fp <- colSums(x) - TP
-    FN <- fn <- fn/(fn + TP)
-    FP <- fp <- fp/(TP + fp)
-    FP[is.na(FP)] <- 1
-    tp <- 1 - fn
-    fp <- tp/(1 - fp) * fp
-    scale <- fn + tp + fp
-    fn <- fn/scale * 100
-    tp <- tp/scale * 100
-    fp <- fp/scale * 100
-    fn[is.na(tp)] <- 50
+	## F-score is 2 * recall * precision / (recall + precision), ... but also
+	## F-score = TP / (TP + FP/2 + FN/2). We represent this in a barplot
+	TP <- tp <- diag(x)
+	FP <- fp <- colSums(x) - tp
+	FN <- fn <- rowSums(x) - tp
+	## In case we have missing data...
+	fn[is.na(tp)] <- 50
     fp[is.na(tp)] <- 50
     tp[is.na(tp)] <- 0
-    res <- matrix(c(fp, tp, fn), ncol = 3)
-    colnames(res) <- c("fp", "tp", "fn")
-    Labels <- names(attr(x, "nbr.cols"))
-    pos <- order(res[, 2], decreasing = TRUE)
+
+	## We scale these values, so that the sum fp/2 + tp + fn/2 makes 100
+	scale <- fp/2 + tp + fn/2
+    res <- matrix(c(fp/2 / scale * 100, tp / scale * 100, fn/2 / scale * 100),
+		ncol = 3)
+    colnames(res) <- c("FPcontrib", "Fscore", "FNcontrib") # In %
+    Labels <- names(attr(x, "col.freqs"))
+    
+	## The graph is ordered in decreasing F-score values
+	pos <- order(res[, 2], decreasing = TRUE)
     res <- res[pos, ]
     FN <- FN[pos]
     FP <- FP[pos]
     TP <- TP[pos]
     Labels <- Labels[pos]
-    L <- length(FN)
-    omar <- par("mar")
+    l <- length(FN)
+    
+	## Plot the graph
+	omar <- par("mar")
     on.exit(par(omar))
     par(mar = mar)
-	## Plot the graph
-    barplot(t(res), horiz = TRUE, col = col, xaxt = "n", las = 1, space = 0, ...)
-    lines(c(50, 50), c(0, L), lwd = 1)
-    xpos <- res[, 1] + res[, 2]/2
-    text(xpos, 1:L - 0.5, round(TP), adj = c(0.5, 0.5), cex = cex)
-    # Add recall and precision if enough place to print it...
-    NotPlace <- res[,"tp"] <= min.width
-    if (any(NotPlace)) {
-		## Special case if not enough place to print precision and recall
-		## Add Precision
-		PrecTxt <- paste(round((1 - FP) * 100), "%", sep = "")
-		PrecTxt[NotPlace] <- ""
-		text(res[,"fp"] + 1, 1:L - 0.5, PrecTxt, adj = c(0, 0.5), cex = cex)
-		## Add FDR
-		FDTxt <- paste(round((FP) * 100), "%", sep = "")
-		FDTxt[!NotPlace] <- ""
-		text(rep(1, length(FP)), 1:L - 0.5, FDTxt, adj = c(0, 0.5), cex = cex)
-		## Add Recall
-		RecTxt <- paste(round((1 - FN) * 100), "%", sep = "")
-		RecTxt[NotPlace] <- ""
-		text(res[,"fp"] + res[, "tp"] - 5, 1:L - 0.5, RecTxt, adj = c(0, 0.5),
-			cex = cex)
-		## Add FN
-		FNTxt <- paste(round((FN) * 100), "%", sep = "")
-		FNTxt[!NotPlace] <- ""
-		text(rep(99, length(FN)), 1:L - 0.5, FNTxt, adj = c(1, 0.5), cex = cex)
-    } else {
-		## Add Precision
-		text(res[,"fp"] + 1, 1:L - 0.5, paste(round((1 - FP) *
-			100), "%", sep = ""), adj = c(0, 0.5), cex = cex)
-		## Add Recall
-		text(res[,"fp"] + res[, "tp"] - 5, 1:L - 0.5, paste(round((1 - FN) *
-			100), "%", sep = ""), adj = c(0, 0.5), cex = cex)
-    }
-    legend(50, L * 1.05, legend = c("False Discovery : (1 - Prec.)",
-		"True Positive (TP)", "False Negative : (1 - Rec.)"), cex = cex.legend,
-		xjust = 0.5, yjust = 1, fill = col, bty = "n", horiz = TRUE)
-    axis(2, 1:L - 0.5, tick = FALSE, las = 1, cex.axis = cex.axis,
-		labels = Labels)
-    title(main = main)
-    text(50, -0.5, "< higher precision : TP/(TP+FP) - underestimate <=> overestimate - higher recall : TP/(TP+FN) >  ",
-        cex = cex)
+    ## The barplot
+	barplot(t(res), horiz = TRUE, col = col, xaxt = "n", las = 1, space = 0,
+		main = main, ...)
+    ## The line that shows where symmetry is
+	lines(c(50, 50), c(0, l), lwd = 1)
+    
+	## Do we add figures into the plot?
+	if (isTRUE(as.logical(numbers))) {
+		## F-score is written in the middle of the central bar
+		xpos <- res[, 1] + res[, 2] / 2
+		text(xpos, 1:l - 0.5, paste("(", round(res[, 2]), "%)", sep = ""),
+			adj = c(0.5, 0.5), cex = cex)
+		
+		## Add the number of FP and FN to the left and right, respectively
+		text(rep(1, l), 1:l - 0.5, round(FP), adj = c(0, 0.5), cex = cex)
+		text(rep(99, l), 1:l - 0.5, round(FN), adj = c(1, 0.5), cex = cex)
+	}
+
+    ## Add a legend (if cex.legend is not NULL)
+	if (length(cex.legend)) {
+		legend(50, l * 1.05, legend = c("False Positives",
+			"2*TP (F-score %)", "False Negatives"), cex = cex.legend, xjust = 0.5, yjust = 1,
+			fill = col, bty = "n", horiz = TRUE)
+	}
+    
+	## Add axes if cex.axis is not NULL
+	if (length(cex.axis))
+		axis(2, 1:l - 0.5, tick = FALSE, las = 1, cex.axis = cex.axis,
+			labels = Labels)
+    
     invisible(res)
 }
 
-## Precision vs Recall, alternate presentation
-## Note used, but saved for now
-#.confusionBar <- function (x, y = NULL,
-#col = c("PeachPuff2", "green",  "green3", "lemonChiffon2"),
-#mar = c(2.1, 8.1, 4.1, 2.1), cex = 0.7, cex.axis = cex, cex.legend = cex,
-#main = "Precision versus Recall", ...)
-#{
-#	if (!inherits(x, "confusion"))
-#		stop("'x' must be a 'confusion' object")
-#	if (!is.null(y))
-#		stop("cannot use a second classifier 'y' for this plot")
-#
-#	## Calculation of statistics
-#	Stats <- summary(x)
-#	FDR <- Stats$FDR * 100
-#	Precision <- Stats$Precision * 100
-#	Recall <- Stats$Recall * 100
-#	FNR <- Stats$FNR * 100
-#	## Order statistics according to Precision + recall
-#	pos <- order(Recall + Precision, decreasing = TRUE)
-#	## Results to plot
-#	res <- cbind(FDR, Precision, Recall, FNR)
-#	## Do the plot
-#	omar <- par("mar")
-#	on.exit(par(omar))
-#	par(mar = mar)
-#	barplot(t(res[pos, ]), horiz = TRUE, col = col, xaxt = "n", las = 1,
-#		space = 0, ...)
-#	## Add information
-#	n <- nrow(Stats)
-#	Labels <- names(attr(x, "nbr.cols"))
-#	axis(2, 1:n - 0.5, tick = FALSE, las = 1, cex.axis = cex.axis,
-#		labels = Labels[pos])
-#	title(main = main)
-#	text(rep(1, n), 1:n - 0.5, paste(round(FDR[pos]), "%", sep = ""),
-#		adj = c(0, 0.5), cex = cex)
-#	text(FDR[pos] + Precision[pos]/2, 1:n - 0.5, paste(round(Precision[pos]),
-#		"%", sep = ""), adj = c(0, 0.5), cex = cex)
-#	text(FDR[pos] + Precision[pos] + Recall[pos]/2, 1:n - 0.5,
-#		paste(round(Recall[pos]), "%", sep = ""), adj = c(0, 0.5), cex = cex)
-#	text(rep(191, n), 1:n - 0.5, paste(round(FNR[pos]), "%", sep = ""),
-#		adj = c(0, 0.5), cex = cex)
-#	legend("top", legend = c("False Discovery ", "Precision", "Recall",
-#		"False Negative"), cex = cex.legend, fill = col, bty = "n", horiz = TRUE)
-#	text(96, -0.5, "< higher precision - underestimate <=> overestimate - higher recall >  ",
-#		cex = cex)
-#	invisible(res)
-#}
-
-.confusionStars <- function(x, y = NULL, stat1 = "Recall", stat2 = "Precision",
-main = NULL, col = NULL, ...)
+## TODO: check the box around the legend
+confusionStars <- function(x, y = NULL, stat1 = "Recall", stat2 = "Precision",
+names, main, col = c("green2", "blue2", "green4", "blue4"), ...)
 {
-    if (!inherits(x, "confusion"))
+    ## Check objects
+	if (!inherits(x, "confusion"))
         stop("'x' must be a 'confusion' object")
     if (!is.null(y) && !inherits(x, "confusion"))
         stop("'y' must be NULL or a 'confusion' object")
 	
+	## Check stats
 	SupportedStats <- c("Recall", "Precision", "Specificity",
         "NPV", "FPR", "FNR", "FDR", "FOR")
     if (!stat1 %in% SupportedStats)
-        stop("stats1 must be one of followed stats: Recall, Precision, Specificity, NPV, FPR, FNR, FDR, FOR")
+        stop("stats1 must be one of Recall, Precision, Specificity, NPV, FPR, FNR, FDR, FOR")
     if (!stat2 %in% SupportedStats)
-        stop("stats2 must be one of followed stats: Recall, Precision, Specificity, NPV, FPR, FNR, FDR, FOR")
-    Blue <- topo.colors(16)
+        stop("stats2 must be one of Recall, Precision, Specificity, NPV, FPR, FNR, FDR, FOR")
+	
+	## Choose colors TODO: add a colors argument!
+	Blue <- topo.colors(16)
     Green <- terrain.colors(16)
     Stat <- summary(x)
     if (!is.null(y)) { # Comparison of two confusion matrices
@@ -430,32 +491,45 @@
 			Stat2[, stat2])
 		Data <- rbind(Data, rep(0, 4))
 		colnames(Data) <- paste(rep(c(stat1, stat2), each = 2), c(2, 1, 1, 2))
-		if (!length(main))
-			main <- paste("Groups comparison between classifier 1 and 2\nAccuracy 1 =",
-				round(attr(Stat, "Accuracy") * 100), "%, accuracy 2 =",
-				round(attr(Stat2, "Accuracy") * 100), "%")
-		if (!length(col))
-			col <- c("green", Green[1], Blue[2], Blue[6])
+		if (missing(main)) { # Calculate a suitable title
+			if (missing(names)) {
+				names <- c(substitute(x), substitute(y))
+			} else if (length(names) != 2)
+				stop("you must provide two nmaes for the two compared classifiers")
+			names <- as.character(names)
+			main <- paste("Groups comparison (1 =", names[1], ", 2 =",
+			names[2], ")")
+		}
+		if (length(col) >= 4) {
+			col <- col[c(3, 1, 2, 4)]
+		} else stop("you must provide four colors for the two statistics and the two classifiers")
 	} else { # Single confusion matrix
 		Data <- data.frame(Stat[, stat1], Stat[, stat2])
 		Data <- rbind(Data, rep(0, 2))
 		colnames(Data) <- c(stat1, stat2)
-		if (!length(main))
-			main <- paste("Groups comparison\nAccuracy =",
-				round(attr(Stat, "Accuracy") * 100), "%")
-		if (!length(col))
-			col <- c(Green[1], Blue[2])
+		if (missing(main))
+			main <- paste("Groups comparison")
+		if (length(col) >= 2) {
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/zooimage -r 229


More information about the Zooimage-commits mailing list