[Dplr-commits] r921 - in pkg/dplR: . R inst/unitTests man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Fri Dec 12 19:55:17 CET 2014


Author: mvkorpel
Date: 2014-12-12 19:55:16 +0100 (Fri, 12 Dec 2014)
New Revision: 921

Modified:
   pkg/dplR/DESCRIPTION
   pkg/dplR/NAMESPACE
   pkg/dplR/R/latexify.R
   pkg/dplR/inst/unitTests/runit.utils.R
   pkg/dplR/man/latexify.Rd
Log:
latexify() supports a more comprehensive set of characters


Modified: pkg/dplR/DESCRIPTION
===================================================================
--- pkg/dplR/DESCRIPTION	2014-12-11 09:39:52 UTC (rev 920)
+++ pkg/dplR/DESCRIPTION	2014-12-12 18:55:16 UTC (rev 921)
@@ -3,7 +3,7 @@
 Type: Package
 Title: Dendrochronology Program Library in R
 Version: 1.6.1
-Date: 2014-12-11
+Date: 2014-12-12
 Authors at R: c(person("Andy", "Bunn", role = c("aut", "cph",
         "cre", "trl"), email = "andy.bunn at wwu.edu"), person("Mikko",
         "Korpela", role = c("aut", "trl")), person("Franco", "Biondi",

Modified: pkg/dplR/NAMESPACE
===================================================================
--- pkg/dplR/NAMESPACE	2014-12-11 09:39:52 UTC (rev 920)
+++ pkg/dplR/NAMESPACE	2014-12-12 18:55:16 UTC (rev 921)
@@ -22,7 +22,7 @@
 
 importFrom(png, readPNG)
 
-importFrom(stringi, stri_trans_nfc)
+importFrom(stringi, stri_trans_nfc, stri_trans_nfd)
 
 importFrom(stringr, str_pad, str_trim)
 

Modified: pkg/dplR/R/latexify.R
===================================================================
--- pkg/dplR/R/latexify.R	2014-12-11 09:39:52 UTC (rev 920)
+++ pkg/dplR/R/latexify.R	2014-12-12 18:55:16 UTC (rev 921)
@@ -13,15 +13,9 @@
 
 ## Usage: \Sexpr{latexify(string_produced_by_R_code)}
 ##
-## Make arbitrary character() vector compatible with LaTeX by escaping
-## special characters, then convert to UTF-8 encoding.  Any formatting
-## (newlines, tabs, etc.) will be lost.  Note that the set of
-## characters actually supported depends on the font, LaTeX engine and
-## set of packages used.
-##
 ## It seems that Sweave needs doublebackslash = TRUE
 ## but knitr needs doublebackslash = FALSE.
-latexify <- function(x, doublebackslash = TRUE,
+latexify <- function(x, doublebackslash = TRUE, dashdash = TRUE,
                      quotes = c("straight", "curved"),
                      packages = c("fontenc", "textcomp")) {
     y <- as.character(x)
@@ -39,8 +33,11 @@
         cat(y[encBytes], sep = "\n")
         y[encBytes] <- foo
     }
+    l10n <- l10n_info()
+    Letters <- paste0(c(LETTERS, letters), collapse="")
     fontenc <- "fontenc" %in% packages
     textcomp <- "textcomp" %in% packages
+    eurosym <- "eurosym" %in% packages
     straightQuotes <- match.arg(quotes) == "straight"
     ## Remove control characters (not spaces!)
     y <- gsub("(?![[:space:]])[[:cntrl:]]", "", y, perl=TRUE)
@@ -48,10 +45,14 @@
     ## substitution must be done before control characters because
     ## newline belongs to both groups.
     y <- gsub("[[:space:]]+", " ", y)
-    ## Escape LaTeX special characters.
+
+    ## Handle LaTeX special characters in the ASCII range.
     ## Some substitutions are mandatory, others affect matters such as
-    ## the rendering of the character in question (\textquote...) or
-    ## line breaks (\slash).
+    ## the rendering of the character in question (e.g. \textquote...)
+    ## or line breaks (\slash).  Some substitutions are unnecessary if
+    ## using a font encoding other than OT1 (\textbar, ...), but are
+    ## performed every time nonetheless.  The dash (-) is given
+    ## special treatment to disable the em and en dash ligatures.
     ## Source: Scott Pakin (2009) The Comprehensive LaTeX Symbol List.
     ## Accessible through "texdoc symbols".
     ## Particularly section 8.6 "ASCII and Latin 1 quick reference".
@@ -61,46 +62,290 @@
     ## Then, \ is replaced with \textbackslash{},
     ## but not if followed by { or }.
     ## After that, the order does not matter.
+    ##
+    ## This starts the 'substitutions' list, which is finally
+    ## processed close to the end of the function.
     substitutions <-
         list(c("([{}])", "\\\\\\1"),
-             c("\\\\(?!\\{|\\})", "\\\\textbackslash{}"),
+             c("\\\\(?![{}])", "\\\\textbackslash{}"),
              c("\\^", "\\\\textasciicircum{}"),
              c("~", "\\\\textasciitilde{}"),
+             c("<", "\\\\textless{}"),
+             c(">", "\\\\textgreater{}"),
+             c("\\|", "\\\\textbar{}"),
              c("([#$%&_])", "\\\\\\1"),
+             if (isTRUE(dashdash)) {
+                 c("-", "\\\\mbox{-}")
+             },
              if (textcomp && straightQuotes) {
                  c("'", "\\\\textquotesingle{}")
              },
+             if (textcomp && straightQuotes) {
+                 c("`", "\\\\textasciigrave{}")
+             },
              c('"', if (fontenc && straightQuotes) {
                  "\\\\textquotedbl{}"
              } else {
                  "\\\\textquotedblright{}"
              }),
              c("/", "\\\\slash{}"))
-    if (isTRUE(l10n_info()[["MBCS"]])) {
-        ## The output of sQuote() and dQuote() may contain
-        ## non-ASCII quoting characters.  If the input is ASCII,
-        ## it may be a surprise to the user that an UTF-8 input
-        ## encoding is then needed in LaTeX.  Converting the
-        ## quotes to commands solves this problem.
-        substitutions <-
-            c(substitutions,
-              list(c("\u2018", "\\\\textquoteleft{}"),
-                   c("\u2019", "\\\\textquoteright{}"),
-                   c("\u201c", "\\\\textquotedblleft{}"),
-                   c("\u201d", "\\\\textquotedblright{}")))
-    }
-    ## Remove empty group after command when followed by a backslash
-    Letters <- paste(c(LETTERS, letters), collapse="")
-    substitutions <- c(substitutions,
-                       list(c(sprintf("(\\\\[%s]+)\\{\\}(?=\\\\)",
-                                      Letters), "\\1")))
+    substitutions <- substitutions[!vapply(substitutions, is.null, logical(1))]
 
-    for (subst in substitutions[!vapply(substitutions, is.null, logical(1))]) {
+    ## Treatment of non-ASCII characters follows.
+
+    ## Digraphs and ligatures broken into their parts, except the
+    ## Dutch digraphs IJ and ij which have their own commands.
+    substitutions <-
+        c(substitutions,
+          list(c("\u0132", "\\\\IJ{}"),
+               c("\u0133", "\\\\ij{}"),
+               c("\u01f1", "DZ"),
+               c("\u01f2", "Dz"),
+               c("\u01f3", "dz"),
+               c("\u01c4", "DZ\u030c"),
+               c("\u01c5", "Dz\u030c"),
+               c("\u01c6", "dz\u030c"),
+               c("\u01c7", "LJ"),
+               c("\u01c8", "Lj"),
+               c("\u01c9", "lj"),
+               c("\u01ca", "NJ"),
+               c("\u01cb", "Nj"),
+               c("\u01cc", "nj"),
+               c("\ufb00", "ff"),
+               c("\ufb01", "fi"),
+               c("\ufb02", "fl"),
+               c("\ufb03", "ffi"),
+               c("\ufb04", "ffl"),
+               c("\ufb05", "\u017ft"),
+               c("\ufb06", "st")))
+
+    ## Accents (diacritics) above the letter (drop i and j dots)
+    above <- list(diaeresis   = c("\u0308", "\""),
+                  acute       = c("\u0301", "'"),
+                  dotabove    = c("\u0307", "."),
+                  macron      = c("\u0304", "="),
+                  circumflex  = c("\u0302", "^"),
+                  grave       = c("\u0300", "`"),
+                  tilde       = c("\u0303", "~"),
+                  doubleacute = c("\u030b", "H"),
+                  ringabove   = c("\u030a", "r"),
+                  breve       = c("\u0306", "u"),
+                  caron       = c("\u030c", "v"),
+                  invbreve    = c("\u0311", "newtie")) # textcomp
+    ## Accents that co-exist with i and j dots (mainly below the
+    ## letter, but also the ligature tie)
+    below <- list(macronbelow = c("\u0331", "b"),
+                  cedilla     = c("\u0327", "c"),
+                  dotbelow    = c("\u0323", "d"),
+                  tie         = c("\u0361", "t"),
+                  ogonek      = c("\u0328", "k")) # not in OT1 fontenc
+    accents <- c(above, below)
+    command <- paste0("\\\\[", Letters, "]+|\\\\.")
+    combining <- paste0(vapply(accents, "[", character(1), 1),
+                        collapse="")
+    accPre <- paste0("(", command, "|.)({})?(?<![", combining, "])")
+    accPost <- paste0("(?![", combining, "])")
+
+    ## Accent above the letter
+    aboveInCode <- vapply(above, "[", character(1), 1)
+    ijPattern <- paste0("([ij])", aboveInCode, accPost)
+    otherPattern <- paste0(accPre, aboveInCode, accPost)
+    aboveOutCode <- vapply(above, "[", character(1), 2)
+    ijReplacement <- paste0("\\\\", aboveOutCode, "{\\\\\\1}")
+    otherReplacement <- paste0("\\\\", aboveOutCode, "{\\1}")
+    ## for (accent in above) {
+    ##     code <- accent[1]
+    ##     replacement <- accent[2]
+    ##     y <- gsub(paste0("([ij])", code, accPost),
+    ##               paste0("\\\\", replacement, "{\\\\\\1}"), y, perl = TRUE)
+    ##     y <- gsub(paste0(accPre, code, accPost),
+    ##               paste0("\\\\", replacement, "{\\1}"), y, perl = TRUE)
+    ## }
+
+    ## Accent below the letter, and ligature tie
+    belowInCode <- vapply(below, "[", character(1), 1)
+    belowPattern <- paste0(accPre, belowInCode, accPost)
+    belowOutCode <- vapply(below, "[", character(1), 2)
+    belowReplacement <- paste0("\\\\", belowOutCode, "{\\1}")
+    ## for (accent in below) {
+    ##     code <- accent[1]
+    ##     replacement <- accent[2]
+    ##     y <- gsub(paste0(accPre, code, accPost),
+    ##               paste0("\\\\", replacement, "{\\1}"), y, perl = TRUE)
+    ## }
+
+    ## Combining an enclosing circle with an accent seems to work
+    circPre <- paste0("(", command, "({([^}]|\\\\})+})?|.)({})?")
+    circPattern <- paste0(circPre, "\u20dd", accPost)
+    circReplacement <- "\\\\textcircled{\\1}"
+
+    substitutions <-
+        c(substitutions,
+          lapply(lapply(mapply(list, list(as.name("c")),
+                               c(ijPattern, otherPattern,
+                                 belowPattern, circPattern),
+                               c(ijReplacement, otherReplacement,
+                                 belowReplacement, circReplacement),
+                               SIMPLIFY=FALSE), as.call), eval))
+
+    ## The output of sQuote() and dQuote() may contain non-ASCII
+    ## quoting characters.  If the input is ASCII, it may be a
+    ## surprise to the user that an UTF-8 input encoding is then
+    ## needed in LaTeX.  Converting the quotes to commands solves
+    ## this problem.  The substitution list also contains
+    ## non-ASCII letters and other unicode characters.
+    substitutions <-
+        c(substitutions,
+          list(c("\u00a1", "\\\\textexclamdown{}"),
+               c("\u00a3", "\\\\pounds{}"),
+               c("\u00a7", "\\\\S{}"),
+               c("\u00a9", "\\\\copyright{}"),
+               c("\u00aa", "\\\\textordfeminine{}"),
+               c("\u00ae", "\\\\textregistered{}"),
+               c("\u00b6", "\\\\P{}"),
+               c("\u00b7", "\\\\textperiodcentered{}"),
+               c("\u00ba", "\\\\textordmasculine{}"),
+               c("\u00bf", "\\\\textquestiondown{}"),
+               c("\u2013", "\\\\textendash{}"),
+               c("\u2014", "\\\\textemdash{}"),
+               c("\u2018", "\\\\textquoteleft{}"),
+               c("\u2019", "\\\\textquoteright{}"),
+               c("\u201c", "\\\\textquotedblleft{}"),
+               c("\u201d", "\\\\textquotedblright{}"),
+               c("\u2020", "\\\\dag{}"),
+               c("\u2021", "\\\\ddag{}"),
+               c("\u2022", "\\\\textbullet{}"),
+               c("\u2026", "\\\\dots{}"),
+               c("\u2122", "\\\\texttrademark{}"),
+               c("\u2423", "\\\\textvisiblespace{}"),
+               c("\u00c6", "\\\\AE{}"),
+               c("\u00e6", "\\\\ae{}"),
+               c("\u0152", "\\\\OE{}"),
+               c("\u0153", "\\\\oe{}"),
+               c("\u00d8", "\\\\O{}"),
+               c("\u00f8", "\\\\o{}"),
+               c("\u0141", "\\\\L{}"),
+               c("\u0142", "\\\\l{}"),
+               ## U+1E9E Latin capital letter sharp s. Works with
+               ## XeTeX and LuaTeX, provided that the character is
+               ## present in the font. Otherwise \SS, which usually
+               ## produces "SS".
+               c("\u1e9e", "\\\\ifdefined\\\\XeTeXrevision\\\\iffontchar\\\\font\"1E9E\\\\symbol{\"1E9E}\\\\else\\\\SS\\\\fi\\\\else\\\\ifdefined\\\\directlua\\\\iffontchar\\\\font\"1E9E\\\\symbol{\"1E9E}\\\\else\\\\SS\\\\fi\\\\else\\\\SS\\\\fi\\\\fi{}"),
+               c("\u00df", "\\\\ss{}"),
+               ## U+017F Latin small letter long s
+               c("\u017f", "\\\\ifdefined\\\\XeTeXrevision\\\\symbol{\"017F}\\\\else\\\\ifdefined\\\\directlua\\\\symbol{\"017F}\\\\else{\\\\fontencoding{TS1}\\\\selectfont s}\\\\fi\\\\fi{}")))
+    ## Other non-ASCII letters, punctuation marks.
+    ## These don't work with the OT1 font encoding.
+    substitutions <-
+        c(substitutions,
+          list(c("\u00d0", "\\\\DH{}"),
+               c("\u00f0", "\\\\dh{}"),
+               c("\u0110", "\\\\DJ{}"),
+               c("\u0111", "\\\\dj{}"),
+               c("\u014a", "\\\\NG{}"),
+               c("\u014b", "\\\\ng{}"),
+               c("\u00de", "\\\\TH{}"),
+               c("\u00fe", "\\\\th{}"),
+               c("\u00ab", "\\\\guillemotleft{}"),
+               c("\u00bb", "\\\\guillemotright{}"),
+               c("\u201a", "\\\\quotesinglbase{}"),
+               c("\u201e", "\\\\quotedblbase{}"),
+               c("\u2039", "\\\\guilsinglleft{}"),
+               c("\u203a", "\\\\guilsinglright{}")))
+    ## Miscellaneous, arrows, delimiters, legal symbols, science
+    ## and engineering, currencies, diacritics, text mode math.
+    ## These require textcomp.
+    substitutions <-
+        c(substitutions,
+          list(c("\u00a0", "~"),# no-break space (NBSP)
+               c("\u00ad", "\\\\-"),# soft hyphen (SHY)
+               c("\u200b", "\\\\hspace{0pt}"),# zero width space (ZWSP)
+               c("\u2217", "\\\\textasteriskcentered{}"),
+               c("\u2016", "\\\\textbardbl{}"),
+               c("\u25ef", "\\\\textbigcircle{}"),
+               c("\u2422", "\\\\textblank{}"),
+               c("\u00a6", "\\\\textbrokenbar{}"),
+               c("\u2052", "\\\\textdiscount{}"),
+               c("\u212e", "\\\\textestimated{}"),
+               c("\u203d", "\\\\textinterrobang{}"),
+               c("\u2e18", "\\\\textinterrobangdown{}"),
+               c("\u2116", "\\\\textnumero{}"),
+               c("\u25e6", "\\\\textopenbullet{}"),
+               c("\u2030", "\\\\textperthousand{}"),
+               c("\u2031", "\\\\textpertenthousand{}"),
+               c("\u211e", "\\\\textrecipe{}"),
+               c("\u203b", "\\\\textreferencemark{}"),
+               c("\u02f7", "\\\\texttildelow{}"),
+               c("\u2190", "\\\\textleftarrow{}"),
+               c("\u2191", "\\\\textuparrow{}"),
+               c("\u2192", "\\\\textrightarrow{}"),
+               c("\u2193", "\\\\textdownarrow{}"),
+               c("\u3008", "\\\\textlangle{}"),
+               c("\u3009", "\\\\textrangle{}"),
+               c("\u301a", "\\\\textlbrackdbl{}"),
+               c("\u301b", "\\\\textrbrackdbl{}"),
+               c("\u2045", "\\\\textlquill{}"),
+               c("\u2046", "\\\\textrquill{}"),
+               c("\u2117", "\\\\textcircledP{}"),
+               c("\u2120", "\\\\textservicemark{}"),
+               c("\u2103", "\\\\textcelsius{}"),
+               c("\u2127", "\\\\textmho{}"),
+               c("\u00b5", "\\\\textmu{}"),
+               c("\u03a9", "\\\\textohm{}"),
+               c("\u0e3f", "\\\\textbaht{}"),
+               c("\u00a2", "\\\\textcent{}"),
+               c("\u20a1", "\\\\textcolonmonetary{}"),
+               c("\u00a4", "\\\\textcurrency{}"),
+               c("\u20ab", "\\\\textdong{}"),
+               c("\u20ac", if (eurosym) "\\\\euro{}" else "\\\\texteuro{}"),
+               c("\u20b2", "\\\\textguarani{}"),
+               c("\u20a4", "\\\\textlira{}"),
+               c("\u20a6", "\\\\textnaira{}"),
+               c("\u20b1", "\\\\textpeso{}"),
+               c("\u20a9", "\\\\textwon{}"),
+               c("\u00a5", "\\\\textyen{}"),
+               c("\u02dd", "\\\\textacutedbl{}"),
+               c("\u00b4", "\\\\textasciiacute{}"),
+               c("\u00b8", "\\\\c{}"),
+               c("\u02d8", "\\\\textasciibreve{}"),
+               c("\u02c7", "\\\\textasciicaron{}"),
+               c("\u00a8", "\\\\textasciidieresis{}"),
+               c("\u00af", "\\\\textasciimacron{}"),
+               c("\u00b0", "\\\\textdegree{}"),
+               c("\u00f7", "\\\\textdiv{}"),
+               c("\u00bc", "\\\\textonequarter{}"),
+               c("\u00bd", "\\\\textonehalf{}"),
+               c("\u00be", "\\\\textthreequarters{}"),
+               c("\u00d7", "\\\\texttimes{}"),
+               c("\u00b1", "\\\\textpm{}"),
+               c("\u00b9", "\\\\textonesuperior{}"),
+               c("\u00b2", "\\\\texttwosuperior{}"),
+               c("\u00b3", "\\\\textthreesuperior{}"),
+               c("\u2044", "\\\\textfractionsolidus{}"),
+               c("\u221a", "\\\\textsurd{}"),
+               c("\u00ac", "\\\\textlnot{}"),
+               c("\u2212", "\\\\textminus{}")))
+
+    ## After a command, remove empty group (when at the end of the
+    ## string or some suitable character follows) or replace it with a
+    ## space (when a space does not follow).
+    tmp <- paste0("(\\\\[", Letters, "]+){}")
+    substitutions <-
+        c(substitutions,
+          list(c(paste0(tmp, "(?=$|[[:digit:],.?!;:\\\\}+*/-])"), "\\1"),
+               c(paste0(tmp, "(?! )"), "\\1 ")))
+
+    ## Convert strings to UTF-8 encoding, NFD (decomposed) form, for
+    ## processing of accented characters.
+    y <- stri_trans_nfd(y)
+    ## Apply the substitutions in the list
+    for (subst in substitutions) {
         y <- gsub(subst[1], subst[2], y, perl = TRUE)
     }
     if (isTRUE(doublebackslash)) {
         y <- gsub("\\", "\\\\", y, fixed=TRUE)
     }
-    ## Convert result to UTF-8 NFC encoding
+    ## Convert result to UTF-8 NFC encoding, although most non-ASCII
+    ## content has probably been converted to LaTeX commands.
     stri_trans_nfc(y)
 }

Modified: pkg/dplR/inst/unitTests/runit.utils.R
===================================================================
--- pkg/dplR/inst/unitTests/runit.utils.R	2014-12-11 09:39:52 UTC (rev 920)
+++ pkg/dplR/inst/unitTests/runit.utils.R	2014-12-12 18:55:16 UTC (rev 921)
@@ -28,7 +28,12 @@
               msg="IDs have a restricted character (4/16 choices) in one position")
 }
 
-test.latexify <- function(testDocument=FALSE) {
+## If 'testDocument' is TRUE, produces a test document to 'con', which may
+## be a connection or a filename.
+test.latexify <-
+    function(testDocument=FALSE,
+             con=tempfile(pattern = "latexify", fileext = ".tex"))
+{
     ## Number of test strings
     ## (including one "extra difficult" case and one empty string)
     SAMP.SIZE <- 50
@@ -39,16 +44,22 @@
     ## All ASCII characters except NUL (0)
     characters <- rawToChar(as.raw(1:127), multiple = TRUE)
     ## LaTeX special characters.  Some of these must be converted to
-    ## commands.  Others (single and double quote) are converted to
-    ## commands or other characters for improved compatibility with
-    ## other packages or to get a particular glyph (upright quote)
-    ## instead of the default (curved quote).
+    ## commands.  Others (e.g. single and double quote, *) are
+    ## converted to commands or other characters for improved
+    ## compatibility with other packages or to get a particular glyph
+    ## (upright quote, centered asterisk) instead of the default
+    ## (curved quote, asterisk located higher than the center of the
+    ## line).  Some characters would work fine in font encodings other
+    ## than OT1 (e.g. <, >, |), but are converted to commands anyway.
+    ## The dash is included to prevent -- and --- turning into an n
+    ## dash and an m dash, respectively.
     ##
     ## NOTE that the handling (what kind of treatment if any) of some
-    ## characters (currently single quote) depends on the (default)
+    ## characters (currently quotes) depends on the (default)
     ## arguments of latexify().
     specialChars <-
-        c("{", "}", "\\", "#", "$", "%", "^", "&", "_", "~", "\"", "/", "'")
+        c("{", "}", "\\", "#", "$", "%", "^", "&", "_", "~", "\"", "/", "'",
+          "<", ">", "|", "`", "-")
     specialStr <- paste(specialChars, collapse="")
     ## latexify() is designed to convert any sequence of space
     ## characters to a single regular space
@@ -96,12 +107,14 @@
     checkTrue(!any(grepl("\\\\", ltxSingle, fixed=TRUE)),
               msg="No line breaks (double backslash)")
     Letters <- paste(c(LETTERS, letters), collapse="")
-    textCommand <- sprintf("\\\\[%s]+", Letters)
-    commandAndGroup <- paste(textCommand, "(\\{\\}|(?=\\\\))", sep="")
-    commandsAt <- gregexpr(commandAndGroup, ltxSingle, perl=TRUE)
+    textCommand <- sprintf("\\\\[%s]+(\\{([^}]|\\\\})+})?", Letters)
+    commandTerminated <-
+        paste(textCommand,
+              "((?<=})|\\{\\}| +|(?=$|[[:digit:],.?!;:\\\\}+*/-]))", sep="")
+    commandsAt <- gregexpr(commandTerminated, ltxSingle, perl=TRUE)
     checkEquals(lapply(gregexpr(textCommand, ltxSingle), as.vector),
                 lapply(commandsAt, as.vector),
-                msg="Command name is followed by empty group or backslash")
+                msg="Command name is terminated properly")
     escape <- sprintf("\\\\[^%s](\\{\\})?", Letters)
     escapesAt <- gregexpr(escape, ltxSingle)
 
@@ -153,8 +166,8 @@
             ## a space between words
             strStart <- which(diff(c(0, charIdx)) > 0)
             strStop <- c(strStart[-1] - 1, nChars)
-            ## Strip off empty group following a command
-            ltxChars <- sub(sprintf("([%s])\\{\\}$", Letters), "\\1",
+            ## Strip off empty group or spaces following a command
+            ltxChars <- sub(sprintf("([%s])(\\{\\}| +)$", Letters), "\\1",
                             substring(ltxSingle[i], strStart, strStop))
         } else {
             ltxChars <- character(0)
@@ -194,16 +207,16 @@
                        tolerance=0)
     checkTrue(all(specialChars %in% specialMap[, 1]),
               msg="Each special character has a mapping")
-    ## A test for encoding conversion
+    ## A test for handling of different encodings in the input
     latin1String <- "clich\xe9 ma\xf1ana"
     Encoding(latin1String) <- "latin1"
-    utf8fy <- latexify(latin1String)
-    checkEquals("UTF-8", Encoding(utf8fy),
-                msg="Declared encoding is UTF-8")
-    checkEquals(as.raw(c(0x63, 0x6c, 0x69, 0x63, 0x68, 0xc3, 0xa9, 0x20,
-                         0x6d, 0x61, 0xc3, 0xb1, 0x61, 0x6e, 0x61)),
-                charToRaw(utf8fy),
-                msg="Conversion to UTF-8 NFC succeeded")
+    latinConverted <- latexify(latin1String, doublebackslash=FALSE)
+    checkEquals("clich\\'{e} ma\\~{n}ana",
+                latinConverted,
+                msg="Conversion of latin1 string succeeded")
+    checkEquals(latinConverted,
+                latexify(enc2utf8(latin1String), doublebackslash=FALSE),
+                msg="Encoding of the input does not matter")
     ## A test for other than default quoting options
     quoteString <- "\"It's five o'clock\", he said."
     res1 <- latexify(quoteString, doublebackslash=FALSE)
@@ -211,48 +224,123 @@
     res3 <- latexify(quoteString, packages=character(0), doublebackslash=FALSE)
     res4 <- latexify(quoteString, packages="fontenc", doublebackslash=FALSE)
     res5 <- latexify(quoteString, packages="textcomp", doublebackslash=FALSE)
-    exp2 <- gsub("\"", "\\\\textquotedblright{}", quoteString)
-    exp4 <- gsub("\"", "\\\\textquotedbl{}", quoteString)
-    exp5 <- gsub("'", "\\\\textquotesingle{}", exp2)
-    exp1 <- gsub("'", "\\\\textquotesingle{}", exp4)
+    exp2 <- sub("\"", "\\\\textquotedblright ", quoteString)
+    exp2 <- sub("\"", "\\\\textquotedblright", exp2)
+    exp4 <- sub("\"", "\\\\textquotedbl ", quoteString)
+    exp4 <- sub("\"", "\\\\textquotedbl", exp4)
+    exp5 <- gsub("'", "\\\\textquotesingle ", exp2)
+    exp1 <- gsub("'", "\\\\textquotesingle ", exp4)
     checkEquals(exp1, res1, msg="Default straight quotes")
     checkEquals(exp2, res2, msg="Curved quotes")
     checkEquals(res2, res3, msg="Fallback to curved quotes")
     checkEquals(exp4, res4, msg="Fallback to curved single quotes")
-    retVal <- checkEquals(exp5, res5, msg="Fallback to curved double quotes")
+    checkEquals(exp5, res5, msg="Fallback to curved double quotes")
     ## Check that non-ASCII quotes used by dQuote() and sQuote() are
     ## converted to LaTeX commands
-    if (isTRUE(l10n_info()[["MBCS"]])) {
-        nestQuotes <- paste0("You said, \u201cHe said, ",
-                             "\u2018Have a nice day.\u2019\u201d")
-        nq <- latexify(nestQuotes, doublebackslash=FALSE)
-        retVal <-
-            checkEquals(gsub("\\{\\}(?=\\\\)", "",
-                             gsub("\u2018", "\\\\textquoteleft{}",
-                                  gsub("\u2019", "\\\\textquoteright{}",
-                                       gsub("\u201c", "\\\\textquotedblleft{}",
-                                            gsub("\u201d",
-                                                 "\\\\textquotedblright{}",
-                                                 nestQuotes)))),
-                             perl=TRUE),
-                        nq)
-    }
+    nestQuotes <- paste0("You said, \u201cHe said, ",
+                         "\u2018Have a nice day.\u2019\u201d")
+    nq <- latexify(nestQuotes, doublebackslash=FALSE)
+    checkEquals(gsub("\\{\\}(?=\\\\)", "",
+                     gsub("\u2018", "\\\\textquoteleft ",
+                          gsub("\u2019", "\\\\textquoteright",
+                               gsub("\u201c", "\\\\textquotedblleft ",
+                                    gsub("\u201d",
+                                         "\\\\textquotedblright",
+                                         nestQuotes)))),
+                     perl=TRUE),
+                nq, msg="dQuote() and sQuote() are safe")
+    diaeresisD <- "o\u0308ljysa\u0308ilio\u0308"
+    diasD <- latexify(diaeresisD, doublebackslash=FALSE)
+    diaeresisC <- "\u00f6ljys\u00e4ili\u00f6"
+    diasC <- latexify(diaeresisC, doublebackslash=FALSE)
+    checkEquals(diasD, diasC, msg="Unicode NFD and NFC")
+    ## Strings containing practically every non-ASCII character that
+    ## will be converted by latexify()
+    breakWords <- rep(c("space", "vacation", "movie", "line", "break",
+                        "rope", "period"), 2)
+    allChars <-
+        c("\u0132sselmeer is a lake, Berl\u0133n is Dutch for Berlin.",
+          paste0("Other digraphs and ligatures: \u01f1, \u01f2, \u01f3, ",
+                 "\u01c4, \u01c5, \u01c6, \u01c7, \u01c8, \u01c9, \u01ca, ",
+                 "\u01cb, \u01cc, \ufb00, \ufb01, \ufb02, \ufb03, \ufb04, ",
+                 "\ufb05, \ufb06"),
+          paste0("\u017c\u00f3\u0142ty, p\u0113c, f\u00eate, ",
+                 "vis-\u00e0-vis, pi\u00f1ata, Erd\u0151s, ",
+                 "\u00c5ngstr\u00f6m, m\u0103r, t\u0159i, \u0203, t\u0331, ",
+                 "fa\u00e7ade, \u1e0c, ",
+                 "k\u0361p (there should be a ligature tie), t\u0105sa"),
+          "Circled: a\u20dd, \u00f1\u20dd",
+          "\u00a1Hola! \u00bfQu\u00e9 pasa? \u2e18Verdad\u203d",
+          paste0("Money: \u00a3, \u20ac, \u00a2, \u00a5, \u00a4, \u0e3f, ",
+                 "\u20a1, \u20ab, \u20b2, \u20a4, \u20a6, \u20b1, \u20a9"),
+          paste0("No-Break\u00a0", breakWords, collapse=" "),
+          paste0("Do-Break ", breakWords, collapse=" "),
+          "visible\u2423space",
+          paste0(rep("Zero\u200bWidth\u200bSpace\u200b", 10), collapse=""),
+          paste0(rep(paste0("S\u00ado\u00adf\u00adt\u00ad",
+                            "H\u00ady\u00adp\u00adh\u00ade\u00adn\u00ad",
+                            "E\u00adv\u00ade\u00adr\u00ady\u00ad",
+                            "w\u00adh\u00ade\u00adr\u00ade"), 8),
+                 collapse="\u00ad"),
+          "Legal \u00a7, \u00a9, \u00ae, \u00b6, \u2117, \u2120, \u2122",
+          paste0("Letters \u00c6, \u00e6, \u0152, \u0153, \u00d8, \u00f8, ",
+                 "\u0141, \u0142, \u1e9e, \u00df, \u017f, \u00d0, \u00f0, ",
+                 "\u0110, \u0111, \u014a, \u014b, \u00de, \u00fe"),
+          paste0("Quotes: \u00ab | \u00bb | \u201a | \u201e | \u2039 | \u203a",
+                 " | \u2018 | \u2019 | \u201c | \u201d"),
+          paste0("mho \u2127, ohm \u03a9, micro \u00b5, Celsius \u2103, ",
+                 "degree \u00b0"),
+          paste0("Division \u00f7, times \u00d7, fractions \u00bc, \u00bd, ",
+                 "\u00be, plusminus \u00b1, root \u221a, minus \u2212, ",
+                 "asterisk \u2217, not \u00ac, fraction solidus \u2044, ",
+                 "superscript \u00b9, \u00b2, \u00b3"),
+          paste0("Discount \u2052, estimated \u212e, \u2116 5, ",
+                 "per mille \u2030, parts per ten thousand \u2031"),
+          "Bullets: \u2022 Closed, \u25e6 open",
+          paste0("Spacing diacritics: double acute \u02dd, acute \u00b4, ",
+                 "cedilla \u00b8, breve \u02d8, ",
+                 "caron \u02c7, diaeresis \u00a8, macron \u00af"),
+          "Arrows: left \u2190, up \u2191, right \u2192, down \u2193",
+          "Delimiters: \u3008, \u3009, \u301a, \u301b, \u2045, \u2046",
+          paste0("Miscellaneous: feminine ordinal \u00aa, ",
+                 "masculine ordinal \u00ba, middle dot \u00b7, ",
+                 "daggers \u2020, \u2021, ellipsis \u2026, ",
+                 "double vertical line \u2016, large circle \u25ef, ",
+                 "blank symbol \u2422, broken bar \u00a6, recipe \u211e, ",
+                 "reference mark \u203b, low tilde \u02f7, ",
+                 "en dash \u2013, em dash \u2014"))
+    ac <- latexify(allChars, doublebackslash=FALSE)
+    retVal <- checkTrue(all(Encoding(ac) == "unknown"),
+                        msg("No non-ASCII characters left"))
     ## When used independently outside the test suite, the function
-    ## can create a test document
+    ## can create a test document, but only in a UTF-8 locale.
     if (isTRUE(testDocument) && isTRUE(l10n_info()[["UTF-8"]])) {
         preamble <- c("\\documentclass[a4paper]{article}",
+                      "\\usepackage{etoolbox}",
+                      "\\usepackage{ifluatex}",
+                      "\\usepackage{ifxetex}",
+                      "\\usepackage{parskip}",
+                      "\\usepackage{textcomp}",
+                      "\\ifbool{luatex}{",
+                      "\\usepackage{fontspec}",
+                      "}{",
+                      "\\ifbool{xetex}{",
+                      "\\usepackage{fontspec}",
+                      "}{",
                       "\\usepackage[T1]{fontenc}",
-                      "\\usepackage{textcomp}",
-                      "\\usepackage{parskip}",
-                      "\\usepackage[utf8]{inputenx}",
-                      "\\input{ix-utf8enc.dfu}")
-        id <- c(testStrings, latin1String, rep(quoteString, 5), nestQuotes)
-        extraInfo <- c(rep("", length(testStrings) + 1),
+                      "\\usepackage{lmodern}",
+                      "}}")
+        id <- c(testStrings, latin1String, rep(quoteString, 5), nestQuotes,
+                diaeresisD, diaeresisC, allChars)
+        extraInfo <- c(rep("", length(testStrings) + length(latin1String)),
                        paste0(" (", c("default", "curved", "no packages",
                                       "only fontenc", "only textcomp"), ")"),
-                       "")
+                       rep("", length(nestQuotes)),
+                       rep(" (Unicode NFD)", length(diaeresisD)),
+                       rep(" (Unicode NFC)", length(diaeresisC)),
+                       rep("", length(allChars)))
 
-        ## Record how R prints inputDescription
+        ## Record how R prints the elements in 'id'
         inputDescription <- character(length(id)) # dummy line
         tc <- textConnection("inputDescription", "w", local = TRUE)
         sink(tc)
@@ -265,9 +353,18 @@
         close(tc)
         on.exit()
 
-        allOutput <- c(ltxSingle, utf8fy, res1, res2, res3, res4, res5, nq)
-        filename <- tempfile(pattern = "latexify", fileext = ".tex")
-        co <- file(filename, open = "wt", encoding = "UTF-8")
+        allOutput <- c(ltxSingle, latinConverted, res1, res2, res3, res4,
+                       res5, nq, diasD, diasC, ac)
+        if (is.character(con)) {
+            co <- file(con, open = "wt", encoding = "UTF-8")
+            on.exit(close(co))
+        } else {
+            co <- con
+            if (!isOpen(co)) {
+                open(co, open = "wt")
+                on.exit(close(co))
+            }
+        }
         writeLines(preamble, co)
         writeLines("\\begin{document}", co)
         writeLines("\\begin{enumerate}", co)
@@ -277,8 +374,7 @@
                    co, sep = "\n\n")
         writeLines("\\end{enumerate}", co)
         writeLines("\\end{document}", co)
-        close(co)
-        filename
+        con
     } else {
         retVal
     }

[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/dplr -r 921


More information about the Dplr-commits mailing list