[Dplr-commits] r915 - in pkg/dplR: . R inst/unitTests man

Mon Nov 17 22:01:13 CET 2014

Author: mvkorpel
Date: 2014-11-17 22:01:12 +0100 (Mon, 17 Nov 2014)
New Revision: 915

Added:
   pkg/dplR/inst/unitTests/runit.utils.R
Modified:
   pkg/dplR/ChangeLog
   pkg/dplR/DESCRIPTION
   pkg/dplR/NAMESPACE
   pkg/dplR/R/latexify.R
   pkg/dplR/inst/unitTests/runit.dplR.R
   pkg/dplR/man/latexify.Rd
Log:
Tweaks to latexify():
* Now enforces NFC normalization (using package "stringi")
* Removes more whitespace when control characters are present
* In .Rd: new reference, formatting, technical details

Unit tests:
* Test of uuid.gen() was moved to a new file, runit.utils.R
* In the same file, there are new tests for latexify() and latexDate()


Modified: pkg/dplR/ChangeLog
===================================================================

--- pkg/dplR/ChangeLog	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/ChangeLog	2014-11-17 21:01:12 UTC (rev 915)
@@ -6,6 +6,7 @@
 - Added rasterPlot, latexify() and latexDate() to export list
 - Import readPNG from png.
 - Import more functions from grid.
+- Import stri_trans_nfc from stringi.
 
 File: DESCRIPTION
 -----------------
@@ -16,7 +17,7 @@
   math-dplR.pdf easier with openPDF().  RColorBrewer provides an
   alternative palette to an example in wavelet.plot.Rd.
 
-- New Imported package: png.
+- New Imported packages: png and stringi.
 
 File: chron.R
 -------------

Modified: pkg/dplR/DESCRIPTION
===================================================================
--- pkg/dplR/DESCRIPTION	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/DESCRIPTION	2014-11-17 21:01:12 UTC (rev 915)
@@ -3,7 +3,7 @@
 Type: Package
 Title: Dendrochronology Program Library in R
 Version: 1.6.1
-Date: 2014-11-11
+Date: 2014-11-17
 Authors at R: c(person("Andy", "Bunn", role = c("aut", "cph",
         "cre", "trl"), email = "andy.bunn at wwu.edu"), person("Mikko",
         "Korpela", role = c("aut", "trl")), person("Franco", "Biondi",
@@ -20,7 +20,7 @@
 Depends: R (>= 2.15.0)
 Imports: gmp (>= 0.5-2), graphics, grDevices, grid, stats, utils,
         digest (>= 0.2.3), lattice (>= 0.13-6), png (>= 0.1-1),
-        stringr (>= 0.4), XML (>= 2.1-0)
+        stringi, stringr (>= 0.4), XML (>= 2.1-0)
 Suggests: Biobase, dichromat (>= 1.2-3), foreach, forecast, iterators,
         knitr, RColorBrewer, RUnit (>= 0.4.25), tikzDevice, waveslim
 Description: This package contains functions for performing tree-ring

Modified: pkg/dplR/NAMESPACE
===================================================================
--- pkg/dplR/NAMESPACE	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/NAMESPACE	2014-11-17 21:01:12 UTC (rev 915)
@@ -22,6 +22,8 @@
 
 importFrom(png, readPNG)
 
+importFrom(stringi, stri_trans_nfc)
+
 importFrom(stringr, str_pad, str_trim)
 
 importFrom(utils, head, installed.packages, read.fwf, tail,

Modified: pkg/dplR/R/latexify.R
===================================================================
--- pkg/dplR/R/latexify.R	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/R/latexify.R	2014-11-17 21:01:12 UTC (rev 915)
@@ -37,12 +37,12 @@
         cat(y[encBytes], sep = "\n")
         y[encBytes] <- foo
     }
+    ## Remove control characters (not spaces!)
+    y <- gsub("(?![[:space:]])[[:cntrl:]]", "", y, perl=TRUE)
     ## Convert any sequence of whitespace to a single space.  This
     ## substitution must be done before control characters because
     ## newline belongs to both groups.
     y <- gsub("[[:space:]]+", " ", y)
-    ## Remove control characters
-    y <- gsub("[[:cntrl:]]", "", y)
     ## Escape LaTeX special characters.
     ## Source: Scott Pakin (2009) The Comprehensive LaTeX Symbol List.
     ## Accessible through "texdoc symbols".
@@ -72,16 +72,6 @@
     if (isTRUE(doublebackslash)) {
         y <- gsub("\\", "\\\\", y, fixed=TRUE)
     }
-    ## gsub() may have changed encodings. Therefore we check them
-    ## again.
-    encs <- Encoding(y)
-    encLatin <- which(encs == "latin1")
-    if (length(encLatin) > 0) {
-        y[encLatin] <- iconv(y[encLatin], from = "latin1", to = "UTF-8")
-    }
-    encUnknown <- which(encs == "unknown")
-    if (length(encUnknown) > 0) {
-        y[encUnknown] <- iconv(y[encUnknown], to = "UTF-8")
-    }
-    y
+    ## Convert result to UTF-8 NFC encoding
+    stri_trans_nfc(y)
 }

Modified: pkg/dplR/inst/unitTests/runit.dplR.R
===================================================================
--- pkg/dplR/inst/unitTests/runit.dplR.R	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/inst/unitTests/runit.dplR.R	2014-11-17 21:01:12 UTC (rev 915)
@@ -728,33 +728,3 @@
     checkTrue(is.nan(tbrm(seq.even, C=0)),
               msg="Robust mean of an even length sequence of consecutive integers is NaN when using a small C")
 }
-
-test.uuid.gen <- function() {
-    ## Setup
-    SAMP.SIZE <- 100
-    ugen <- uuid.gen()
-    uuids <- character(SAMP.SIZE)
-    for(i in seq_len(SAMP.SIZE))
-        uuids[i] <- ugen()
-    uuids.split <- strsplit(uuids, split="-", fixed=TRUE)
-    unique.nchar <- unique(t(sapply(uuids.split, nchar)))
-    unique.chars <-
-        unique(strsplit(paste(sapply(uuids.split, paste, collapse=""),
-                              collapse=""), split=character(0))[[1]])
-    all.4 <- unique(substr(uuids, 15, 15))
-    one.of.four <- unique(substr(uuids, 20, 20))
-    ## Test
-    checkEquals(SAMP.SIZE, length(unique(uuids)), msg="Unique IDs are unique")
-    checkTrue(all(nchar(uuids) == 36), msg="IDs have correct length")
-    checkTrue(all(sapply(uuids.split, length) == 5),
-              msg="IDs have 5 parts separated by dashes")
-    checkTrue(nrow(unique.nchar) == 1 &&
-              all(as.vector(unique.nchar) == c(8, 4, 4, 4, 12)),
-              msg="The parts have lengths 8, 4, 4, 4, and 12")
-    checkTrue(all(unique.chars %in% c(as.character(0:9), letters[seq_len(6)])),
-              msg="In addition to dashes, IDs only contain hexadecimal digits")
-    checkEquals("4", all.4,
-                msg="IDs have a constant character \"4\" in one position")
-    checkTrue(all(one.of.four %in% c("8", "9", "a", "b")),
-              msg="IDs have a restricted character (4/16 choices) in one position")
-}

Added: pkg/dplR/inst/unitTests/runit.utils.R
===================================================================
--- pkg/dplR/inst/unitTests/runit.utils.R	                        (rev 0)
+++ pkg/dplR/inst/unitTests/runit.utils.R	2014-11-17 21:01:12 UTC (rev 915)
@@ -0,0 +1,226 @@
+test.uuid.gen <- function() {
+    ## Setup
+    SAMP.SIZE <- 100
+    ugen <- uuid.gen()
+    uuids <- character(SAMP.SIZE)
+    for(i in seq_len(SAMP.SIZE))
+        uuids[i] <- ugen()
+    uuids.split <- strsplit(uuids, split="-", fixed=TRUE)
+    unique.nchar <- unique(t(sapply(uuids.split, nchar)))
+    unique.chars <-
+        unique(strsplit(paste(sapply(uuids.split, paste, collapse=""),
+                              collapse=""), split=character(0))[[1]])
+    all.4 <- unique(substr(uuids, 15, 15))
+    one.of.four <- unique(substr(uuids, 20, 20))
+    ## Test
+    checkEquals(SAMP.SIZE, length(unique(uuids)), msg="Unique IDs are unique")
+    checkTrue(all(nchar(uuids) == 36), msg="IDs have correct length")
+    checkTrue(all(sapply(uuids.split, length) == 5),
+              msg="IDs have 5 parts separated by dashes")
+    checkTrue(nrow(unique.nchar) == 1 &&
+              all(as.vector(unique.nchar) == c(8, 4, 4, 4, 12)),
+              msg="The parts have lengths 8, 4, 4, 4, and 12")
+    checkTrue(all(unique.chars %in% c(as.character(0:9), letters[seq_len(6)])),
+              msg="In addition to dashes, IDs only contain hexadecimal digits")
+    checkEquals("4", all.4,
+                msg="IDs have a constant character \"4\" in one position")
+    checkTrue(all(one.of.four %in% c("8", "9", "a", "b")),
+              msg="IDs have a restricted character (4/16 choices) in one position")
+}
+
+test.latexify <- function() {
+    ## Number of test strings
+    ## (including one "extra difficult" case and one empty string)
+    SAMP.SIZE <- 50
+    stopifnot(SAMP.SIZE >= 2)
+    MIN.LENGTH <- 1
+    MAX.LENGTH <- 1000
+    ## All ASCII characters except NUL (0)
+    characters <- rawToChar(as.raw(1:127), multiple = TRUE)
+    ## LaTeX special characters must be converted to commands
+    specialChars <-
+        c("{", "}", "\\", "#", "$", "%", "^", "&", "_", "~", "\"", "/")
+    specialStr <- paste(specialChars, collapse="")
+    ## latexify() is designed to convert any sequence of space
+    ## characters to a single regular space
+    spaceChars <- c("\t", "\n", "\v", "\f", "\r", " ")
+    spaceStr <- paste(spaceChars, collapse="")
+    ## latexify() is designed to drop control characters excluding spaces
+    controlChars <- setdiff(rawToChar(as.raw(c(1:31, 127)), multiple = TRUE),
+                            spaceChars)
+    controlStr <- paste(controlChars, collapse="")
+    ## Decide the length of each test string
+    stringLengths <- sample(MIN.LENGTH:MAX.LENGTH, SAMP.SIZE - 2)
+    nTotal <- sum(stringLengths)
+    ## Create the test strings:
+    ## * The last element is a "difficult case".
+    ## * The other elements consist of a random sample of characters.
+    strStop <- cumsum(stringLengths)
+    strStart <- strStop - (stringLengths - 1)
+    testStrings <-
+        c(substring(paste(sample(rep(characters, length.out = nTotal)),
+                          collapse=""), strStart, strStop),
+          paste(c(specialChars,
+                  rev(specialChars),
+                  rep(specialChars, each=3),
+                  paste(specialChars, " \t")),
+                collapse=""),
+          "")
+
+    ## Run latexify() on testStrings
+    ltxDouble <- latexify(testStrings, doublebackslash=TRUE)
+    ltxSingle <- latexify(testStrings, doublebackslash=FALSE)
+
+    ## Tests
+    checkEquals(ltxDouble,
+                gsub("\\", "\\\\", ltxSingle, fixed=TRUE, useBytes=TRUE),
+                msg="doublebackslash argument works as expected")
+    checkTrue(!any(grepl(sprintf("[%s]", controlStr),
+                         ltxSingle, useBytes=TRUE)),
+              msg="No control characters")
+    checkTrue(!any(grepl(sprintf("[%s]{2,}", spaceStr),
+                         ltxSingle, useBytes=TRUE)),
+              msg="Sequence of space characters collapses into one space")
+    checkTrue(!any(grepl("\\\\", ltxSingle, fixed=TRUE)),
+              msg="No line breaks (double backslash)")
+    Letters <- paste(c(LETTERS, letters), collapse="")
+    textCommand <- sprintf("\\\\[%s]+", Letters)
+    commandAndGroup <- paste(textCommand, "\\{\\}", sep="")
+    commandsAt <- gregexpr(commandAndGroup, ltxSingle)
+    checkEquals(lapply(gregexpr(textCommand, ltxSingle), as.vector),
+                lapply(commandsAt, as.vector),
+                msg="Command name is terminated with empty group")
+    escape <- sprintf("\\\\[^%s](\\{\\})?", Letters)
+    escapesAt <- gregexpr(escape, ltxSingle)
+
+    ## specialMap: record of special character -> command mapping
+    specialMap <- vector(mode="list", length = SAMP.SIZE)
+    ## Test that each test string was converted properly and prepare
+    ## specialMap
+    multiSpaceClass <- sprintf("[%s]+", spaceStr)
+    controlClass <- sprintf("[%s]", controlStr)
+    specialClass <- sprintf("[%s]", specialStr)
+    for (i in seq_len(SAMP.SIZE)) {
+        nChars <- nchar(ltxSingle[i])
+        ## ltxChars: split ltxSingle[i] into strings representing one
+        ## character each
+        if (nChars > 0) {
+            coms <- commandsAt[[i]]
+            escs <- escapesAt[[i]]
+            comLengths <- attr(coms, "match.length")
+            escLengths <- attr(escs, "match.length")
+            if (length(coms) == 1 && coms == -1) {
+                coms <- numeric(0)
+                comLengths <- numeric(0)
+            }
+            if (length(escs) == 1 && escs == -1) {
+                escs <- numeric(0)
+                escLengths <- numeric(0)
+            }
+            comsAndEscs <- c(coms, escs)
+            idx <- order(comsAndEscs)
+            comEsc <- comsAndEscs[idx]
+            comEscLen <- c(comLengths, escLengths)[idx]
+            nComEsc <- length(comEsc)
+            charIdx <- numeric(nChars)
+            prv <- 0
+            prvIdx <- 0
+            for (j in seq_along(comEsc)) {
+                thisStart <- comEsc[j]
+                nSingle <- thisStart - prvIdx - 1
+                charIdx[seq(from = prvIdx + 1, length.out = nSingle)] <-
+                    seq(from = prv + 1, length.out = nSingle)
+                prv <- prv + nSingle + 1
+                prvIdx <- thisStart + (comEscLen[j] - 1)
+                charIdx[thisStart:prvIdx] <- prv
+            }
+            nSingle <- nChars - prvIdx
+            charIdx[seq(from = prvIdx + 1, length.out = nSingle)] <-
+                seq(from = prv + 1, length.out = nSingle)
+            ## Each element of ltxChars should represent one character or
+            ## a space between words
+            strStart <- which(diff(c(0, charIdx)) > 0)
+            strStop <- c(strStart[-1] - 1, nChars)
+            ltxChars <- substring(ltxSingle[i], strStart, strStop)
+        } else {
+            ltxChars <- character(0)
+        }
+        ## stripChars: "Independently" do a part of what latexify()
+        ## does, i.e. remove control characters and use single spaces
+        ## only
+        stripChars <-
+            strsplit(gsub(multiSpaceClass,
+                          " ",
+                          gsub(controlClass,
+                               "",
+                               testStrings[i])),
+                     "")[[1]]
+        ## Compare ltxChars and stripChars
+        checkEqualsNumeric(length(stripChars), length(ltxChars),
+                           msg=sprintf("Number of characters correct (%.0f)",
+                           i),
+                           tolerance=0)
+        singleFlag <- nchar(ltxChars) == 1
+        checkTrue(!any(grepl(specialClass,
+                             ltxChars[singleFlag], useBytes=TRUE)),
+                  msg=sprintf("No specials left unescaped (%.0f)", i))
+        checkEquals(ltxChars[singleFlag], stripChars[singleFlag],
+                    msg=sprintf("Normal characters preserved (%.0f)", i))
+        specialMap[[i]] <- unique(cbind(stripChars[!singleFlag],
+                                        ltxChars[!singleFlag]))
+    }
+    ## specialMap becomes a combination of the unique rows across its
+    ## elements
+    specialMap <- do.call(rbind, specialMap)
+    specialMap <- unique(specialMap)
+    ## Check that special characters are mapped to LaTeX commands in a
+    ## consistent manner
+    checkEqualsNumeric(length(specialChars), nrow(specialMap),
+                       msg="Correct number of character mappings",
+                       tolerance=0)
+    checkTrue(all(specialChars %in% specialMap[, 1]),
+              msg="Each special character has a mapping")
+    ## A separate test for encoding conversion
+    latin1String <- "clich\xe9 ma\xf1ana"
+    Encoding(latin1String) <- "latin1"
+    utf8fy <- latexify(latin1String)
+    checkEquals("UTF-8", Encoding(utf8fy),
+                msg="Declared encoding is UTF-8")
+    checkEquals(as.raw(c(0x63, 0x6c, 0x69, 0x63, 0x68, 0xc3, 0xa9, 0x20,
+                         0x6d, 0x61, 0xc3, 0xb1, 0x61, 0x6e, 0x61)),
+                charToRaw(utf8fy),
+                msg="Conversion to UTF-8 NFC succeeded")
+}
+
+test.latexDate <- function() {
+    dates <- Sys.Date() + round(runif(100, min = -1000, max = 1000))
+    latexDates <- latexDate(dates)
+    checkEqualsNumeric(length(dates), length(latexDates),
+                       msg="Length of output equals length of input",
+                       tolerance=0)
+    splitDates <- strsplit(latexDates, ", ")
+    checkEqualsNumeric(rep(2, length(dates)),
+                       vapply(splitDates, length, numeric(1)),
+                       msg="Year at end, separated by comma and space",
+                       tolerance=0)
+    monthsDays <- vapply(splitDates, "[[", character(1), 1)
+    yearStr <- vapply(splitDates, "[[", character(1), 2)
+    Years <- suppressWarnings(as.numeric(yearStr))
+    checkTrue(all(is.finite(Years)), msg="Year is a number")
+    splitDates2 <- strsplit(monthsDays, " ")
+    checkEqualsNumeric(rep(2, length(dates)),
+                       vapply(splitDates2, length, numeric(1)),
+                       msg="Month and day separated by space",
+                       tolerance=0)
+    Months <- match(vapply(splitDates2, "[[", character(1), 1), month.name)
+    checkTrue(all(is.finite(Months)),
+              msg="Month names match to entries in month.name")
+    monthStr <- sprintf("%02.0f", Months)
+    Days <- suppressWarnings(as.numeric(vapply(splitDates2,
+                                               "[[", character(1), 2)))
+    checkTrue(all(is.finite(Days)), msg="Day of month is a number")
+    dayStr <- sprintf("%02.0f", Days)
+    checkEquals(as.character(dates),
+                paste(yearStr, monthStr, dayStr, sep="-"),
+                msg="latexDate(x) matches x in ISO format")
+}


Property changes on: pkg/dplR/inst/unitTests/runit.utils.R
___________________________________________________________________
Added: svn:eol-style
   + native

Modified: pkg/dplR/man/latexify.Rd
===================================================================
--- pkg/dplR/man/latexify.Rd	2014-11-11 09:14:55 UTC (rev 914)
+++ pkg/dplR/man/latexify.Rd	2014-11-17 21:01:12 UTC (rev 915)
@@ -22,10 +22,13 @@
 
   The function is intended for use with unformatted inline text.
   Newlines, tabs and other whitespace characters (\code{"[:space:]"} in
-  \link{regex}) are converted to spaces.  Control character
+  \link{regex}) are converted to spaces.  Control characters
   (\code{"[:cntrl:]"}) that are not whitespace are removed.  Other
-  special characters are {, }, \, #, $, \%, ^, &, _, ~, \" and /.  They
-  are converted to the corresponding LaTeX commands.
+  special characters are \sQuote{\{}, \sQuote{\}}, \sQuote{\\},
+  \sQuote{#}, \sQuote{$}, \sQuote{\%}, \sQuote{^}, \sQuote{&},
+  \sQuote{_}, \sQuote{~},
+  \sQuote{"}, and \sQuote{/}.
+  They are converted to the corresponding LaTeX commands.
 
   Before applying the substitutions described above, input elements with
   \code{Encoding} set to \code{"bytes"} are printed and the result is
@@ -35,12 +38,11 @@
   This set includes tabs, newlines and control characters.  The
   substitutions are then applied to the intermediate result.
 
-  Input elements with \code{"unknown"} encoding are assumed to be in the
-  current encoding.  These and \code{"latin1"} encoded elements are
-  converted to UTF-8.
+  The result is converted to UTF-8 encoding, Normalization Form C
+  (NFC).
 
-  Suggested package loading commands in the document preamble
-  are:\preformatted{\usepackage[T1]{fontenc}    \% required for "
+  Assuming that \samp{pdflatex} is used for compilation, suggested
+  package loading commands in the document preamble are: \preformatted{\usepackage[T1]{fontenc}    \% required for "
 \usepackage[utf8]{inputenx} \% UTF-8 input encoding
 \input{ix-utf8enc.dfu}      \% more supported characters} 
   
@@ -51,6 +53,8 @@
 \references{
   Pakin, S. (2009) The Comprehensive LaTeX Symbol
   List. \url{http://www.ctan.org/tex-archive/info/symbols/comprehensive}
+
+  The Unicode Consortium. The Unicode Standard. \url{http://www.unicode.org/versions/latest/}
 }
 \author{
   Mikko Korpela