<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40" lang="en-US">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<!-- Document head. The title repeats the message subject shown in the body,
     and lang mirrors the lang value set on the body element. The conditional
     block that follows is guarded by [if !mso] and attaches the VML behavior
     to the v:, o:, w: and .shape selectors. No doctype is declared. -->
<title>Re: [datatable-help] data.table is asking for help</title>
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]--><style><!--
/* Font definitions.
   Six @font-face rules, each giving only a family name and its panose-1
   digits. None of them has a src descriptor, so this stylesheet does not
   reference any font file. Monaco is listed with all-zero panose-1 digits. */
@font-face {
  font-family: Helvetica;
  panose-1: 2 11 6 4 2 2 2 2 2 4;
}
@font-face {
  font-family: "Cambria Math";
  panose-1: 2 4 5 3 5 4 6 3 2 4;
}
@font-face {
  font-family: Calibri;
  panose-1: 2 15 5 2 2 2 4 3 2 4;
}
@font-face {
  font-family: Consolas;
  panose-1: 2 11 6 9 2 2 4 3 2 4;
}
@font-face {
  font-family: "Lucida Console";
  panose-1: 2 11 6 9 4 5 4 2 2 4;
}
@font-face {
  font-family: Monaco;
  panose-1: 0 0 0 0 0 0 0 0 0 0;
}
/* Style Definitions */
/* Paragraphs, list items and divs with class MsoNormal: zero margins apart
   from a .0001pt bottom margin, 12pt Times New Roman. The fallback is the
   unquoted generic keyword serif; in quotes it would be read as a font
   family literally named "serif". */
p.MsoNormal, li.MsoNormal, div.MsoNormal {
  margin: 0in;
  margin-bottom: .0001pt;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
}
/* Level-1 heading: 28pt bold black Times New Roman, zero left/right margins.
   The mso-* declarations are kept verbatim. The fallback is the unquoted
   generic keyword serif; a quoted "serif" names a font family instead. */
h1 {
  mso-style-priority: 9;
  mso-style-link: "Heading 1 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  font-size: 28.0pt;
  font-family: "Times New Roman", serif;
  color: black;
  font-weight: bold;
}
/* Level-2 heading: 18pt bold black Times New Roman, no border or padding,
   zero left/right margins. The mso-* declarations are kept verbatim. The
   fallback is the unquoted generic keyword serif; a quoted "serif" names a
   font family instead. */
h2 {
  mso-style-priority: 9;
  mso-style-link: "Heading 2 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  border: none;
  padding: 0in;
  font-size: 18.0pt;
  font-family: "Times New Roman", serif;
  color: black;
  font-weight: bold;
}
/* Level-3 heading: 13.5pt bold Times New Roman, zero left/right margins.
   The mso-* declarations are kept verbatim. The fallback is the unquoted
   generic keyword serif; a quoted "serif" names a font family instead. */
h3 {
  mso-style-priority: 9;
  mso-style-link: "Heading 3 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  font-size: 13.5pt;
  font-family: "Times New Roman", serif;
  font-weight: bold;
}
/* Level-4 heading: 12pt bold Times New Roman, zero left/right margins.
   The mso-* declarations are kept verbatim. The fallback is the unquoted
   generic keyword serif; a quoted "serif" names a font family instead. */
h4 {
  mso-style-priority: 9;
  mso-style-link: "Heading 4 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
  font-weight: bold;
}
/* Level-5 heading: 10.5pt bold Times New Roman, zero left/right margins.
   The mso-* declarations are kept verbatim. The fallback is the unquoted
   generic keyword serif; a quoted "serif" names a font family instead. */
h5 {
  mso-style-priority: 9;
  mso-style-link: "Heading 5 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  font-size: 10.5pt;
  font-family: "Times New Roman", serif;
  font-weight: bold;
}
/* Level-6 heading: 10.5pt bold Times New Roman in grey (#777777), zero
   left/right margins. The mso-* declarations are kept verbatim. The fallback
   is the unquoted generic keyword serif; a quoted "serif" names a font
   family instead. */
h6 {
  mso-style-priority: 9;
  mso-style-link: "Heading 6 Char";
  mso-margin-top-alt: auto;
  margin-right: 0in;
  mso-margin-bottom-alt: auto;
  margin-left: 0in;
  font-size: 10.5pt;
  font-family: "Times New Roman", serif;
  color: #777777;
  font-weight: bold;
}
/* Unvisited links and span.MsoHyperlink: colour #4183C4, no underline.
   text-decoration takes a single none here; the doubled "none none" form is
   not a valid value, so the whole declaration would be discarded. */
a:link, span.MsoHyperlink {
  mso-style-priority: 99;
  color: #4183C4;
  text-decoration: none;
}
/* Visited links and span.MsoHyperlinkFollowed: colour #4183C4, no underline.
   text-decoration takes a single none here; the doubled "none none" form is
   not a valid value, so the whole declaration would be discarded. */
a:visited, span.MsoHyperlinkFollowed {
  mso-style-priority: 99;
  color: #4183C4;
  text-decoration: none;
}
/* Plain paragraphs: 11.25pt top/bottom margins, zero left/right margins,
   12pt Times New Roman. The fallback is the unquoted generic keyword serif;
   a quoted "serif" names a font family instead. */
p {
  mso-style-priority: 99;
  margin-top: 11.25pt;
  margin-right: 0in;
  margin-bottom: 11.25pt;
  margin-left: 0in;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
}
/* Code elements: Monaco on a #F8F8F8 background with a 1pt #EAEAEA border
   and no padding. The fallback is the unquoted generic keyword monospace so
   code stays fixed-width when Monaco is unavailable; the previous quoted
   "serif" was read as a font family name, not as a generic family. */
code {
  mso-style-priority: 99;
  font-family: "Monaco", monospace;
  border: solid #EAEAEA 1.0pt;
  padding: 0in;
  background: #F8F8F8;
}
/* Preformatted blocks: 10pt Monaco on a #F8F8F8 background, 11.25pt
   top/bottom margins, zero left/right margins, no border or padding. The
   fallback is the unquoted generic keyword monospace so the text stays
   fixed-width when Monaco is unavailable; the previous quoted "serif" was
   read as a font family name, not as a generic family. */
pre {
  mso-style-priority: 99;
  mso-style-link: "HTML Preformatted Char";
  margin-top: 11.25pt;
  margin-right: 0in;
  margin-bottom: 11.25pt;
  margin-left: 0in;
  background: #F8F8F8;
  border: none;
  padding: 0in;
  font-size: 10.0pt;
  font-family: "Monaco", monospace;
}
/* Character style linked to "HTML Preformatted": Consolas, with the generic
   monospace family added as a fallback for systems without Consolas. */
span.HTMLPreformattedChar {
  mso-style-name: "HTML Preformatted Char";
  mso-style-priority: 99;
  mso-style-link: "HTML Preformatted";
  font-family: Consolas, monospace;
}
/* Character style linked to "Heading 1": Calibri Light in #2E74B5. The
   fallback is the unquoted generic keyword sans-serif; a quoted "sans-serif"
   names a font family instead. */
span.Heading1Char {
  mso-style-name: "Heading 1 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 1";
  font-family: "Calibri Light", sans-serif;
  color: #2E74B5;
}
/* Character style linked to "Heading 2": Calibri Light in #2E74B5. The
   fallback is the unquoted generic keyword sans-serif; a quoted "sans-serif"
   names a font family instead. */
span.Heading2Char {
  mso-style-name: "Heading 2 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 2";
  font-family: "Calibri Light", sans-serif;
  color: #2E74B5;
}
/* Character style linked to "Heading 3": Calibri Light in #1F4D78. The
   fallback is the unquoted generic keyword sans-serif; a quoted "sans-serif"
   names a font family instead. */
span.Heading3Char {
  mso-style-name: "Heading 3 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 3";
  font-family: "Calibri Light", sans-serif;
  color: #1F4D78;
}
/* Character style linked to "Heading 4": italic Calibri Light in #2E74B5.
   The fallback is the unquoted generic keyword sans-serif; a quoted
   "sans-serif" names a font family instead. */
span.Heading4Char {
  mso-style-name: "Heading 4 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 4";
  font-family: "Calibri Light", sans-serif;
  color: #2E74B5;
  font-style: italic;
}
/* Character style linked to "Heading 5": Calibri Light in #2E74B5. The
   fallback is the unquoted generic keyword sans-serif; a quoted "sans-serif"
   names a font family instead. */
span.Heading5Char {
  mso-style-name: "Heading 5 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 5";
  font-family: "Calibri Light", sans-serif;
  color: #2E74B5;
}
/* Character style linked to "Heading 6": Calibri Light in #1F4D78. The
   fallback is the unquoted generic keyword sans-serif; a quoted "sans-serif"
   names a font family instead. */
span.Heading6Char {
  mso-style-name: "Heading 6 Char";
  mso-style-priority: 9;
  mso-style-link: "Heading 6";
  font-family: "Calibri Light", sans-serif;
  color: #1F4D78;
}
/* Elements with class send: 12pt Times New Roman in green (#77BB77),
   11.25pt top/bottom margins, zero left/right margins. The fallback is the
   unquoted generic keyword serif; a quoted "serif" names a font family
   instead. */
p.send, li.send, div.send {
  mso-style-name: send;
  margin-top: 11.25pt;
  margin-right: 0in;
  margin-bottom: 11.25pt;
  margin-left: 0in;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
  color: #77BB77;
}
/* Elements with class server: 12pt Times New Roman in blue-grey (#7799BB),
   11.25pt top/bottom margins, zero left/right margins. The fallback is the
   unquoted generic keyword serif; a quoted "serif" names a font family
   instead. */
p.server, li.server, div.server {
  mso-style-name: server;
  margin-top: 11.25pt;
  margin-right: 0in;
  margin-bottom: 11.25pt;
  margin-left: 0in;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
  color: #7799BB;
}
/* Elements with class error: 12pt Times New Roman in dark red (#AA0000),
   11.25pt top/bottom margins, zero left/right margins. The fallback is the
   unquoted generic keyword serif; a quoted "serif" names a font family
   instead. */
p.error, li.error, div.error {
  mso-style-name: error;
  margin-top: 11.25pt;
  margin-right: 0in;
  margin-bottom: 11.25pt;
  margin-left: 0in;
  font-size: 12.0pt;
  font-family: "Times New Roman", serif;
  color: #AA0000;
}
/* Character style marked personal-reply: Calibri in #1F497D. The fallback is
   the unquoted generic keyword sans-serif; a quoted "sans-serif" names a
   font family instead. */
span.EmailStyle32 {
  mso-style-type: personal-reply;
  font-family: "Calibri", sans-serif;
  color: #1F497D;
}
/* Class MsoChpDefault: marked export-only, sets a 10pt font size. */
.MsoChpDefault {
  mso-style-type: export-only;
  font-size: 10.0pt;
}
/* Named page "WordSection1": 8.5in x 11.0in page with 1.0in margins on all
   four sides. */
@page WordSection1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
/* Divs of class WordSection1 are assigned to the named page "WordSection1"
   through the page property. */
div.WordSection1
        {page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]--></head><body lang=EN-US link="#4183C4" vlink="#4183C4"><div class=WordSection1><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Thanks, that’s very helpful.<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><o:p> </o:p></span></p><div><div style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in'><p class=MsoNormal><b><span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'>From:</span></b><span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'> Arunkumar Srinivasan [mailto:aragorn168b@gmail.com] <br><b>Sent:</b> Friday, June 13, 2014 10:46 PM<br><b>To:</b> Ron Hylton; datatable-help@lists.r-forge.r-project.org<br><b>Subject:</b> Re: [datatable-help] data.table is asking for help<o:p></o:p></span></p></div></div><p class=MsoNormal><o:p> </o:p></p><p>Sorry. But we can simplify it even further:<o:p></o:p></p><p>The first step is just <code><span style='font-size:10.0pt'>unique(test)</span></code>. 
So, we can do:<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre><code>system.time({<o:p></o:p></code></pre><pre><code>ans = unique(test)<o:p></o:p></code></pre><pre><code>ans = ans[ans[, .I[.N > 1L], by=id]$V1]<o:p></o:p></code></pre><pre><code>})<o:p></o:p></code></pre><pre><code>#  0.016   0.000   0.016  <o:p></o:p></code></pre></div><p>Identical?<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre><code>setkey(ans)<o:p></o:p></code></pre><pre><code>setkey(ut1)<o:p></o:p></code></pre><pre><code>identical(ans, ut1) # [1] TRUE<o:p></o:p></code></pre></div><div id="bloop_customfont"><p class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'><o:p> </o:p></span></p></div><div id="bloop_sign_1402713822374628096"><div><p class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Arun<o:p></o:p></span></p></div></div><div><p class=MsoNormal><span style='color:black'><br>From: Arunkumar Srinivasan <a href="mailto:aragorn168b@gmail.com">aragorn168b@gmail.com</a><br>Reply: Arunkumar Srinivasan <a href="mailto:aragorn168b@gmail.com">aragorn168b@gmail.com</a><br>Date: June 14, 2014 at 4:42:31 AM<br>To: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a>, <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a> <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br>Subject:  Re: [datatable-help] data.table is asking for help <o:p></o:p></span></p></div><p class=MsoNormal><br><br><o:p></o:p></p><blockquote style='margin-left:0in;margin-top:11.25pt;margin-right:0in;margin-bottom:11.25pt'><div><div><p>A slightly simpler version of the 2nd solution is:<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 
1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8;margin-left:1.5pt;margin-right:1.5pt'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>system.time({<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans = test[, .N, by=names(test)]<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans = ans[ans[, .I[.N > 1L], by=id]$V1]<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>})<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>#  0.019   0.000   0.019   <o:p></o:p></code></pre></div><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre></div><p>The answers are identical, you can check this by doing:<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8;margin-left:1.5pt;margin-right:1.5pt'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans[, N := NULL]<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>setkey(ans)<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>setkey(ut1)<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>identical(ans, ut1) # [1] TRUE<o:p></o:p></code></pre></div><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre></div><div id="bloop_customfont"><p class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'><o:p> </o:p></span></p></div><div id="bloop_sign_1402713543682700032"><div><p 
class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Arun<o:p></o:p></span></p></div></div><div><p class=MsoNormal><span style='color:black'><br>From: Arunkumar Srinivasan <a href="mailto:aragorn168b@gmail.com">aragorn168b@gmail.com</a><br>Reply: Arunkumar Srinivasan <a href="mailto:aragorn168b@gmail.com">aragorn168b@gmail.com</a><br>Date: June 14, 2014 at 4:34:15 AM<br>To: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a>, <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a> <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br>Subject:  Re: [datatable-help] data.table is asking for help<o:p></o:p></span></p></div><p class=MsoNormal><br><br><o:p></o:p></p><blockquote style='margin-left:0in;margin-top:11.25pt;margin-right:0in;margin-bottom:11.25pt'><div><div><p style='margin:0in;margin-bottom:.0001pt'>The j-expression is evaluated from within C for each group (unless they’re optimised with GForce - a new initiative in data.table). 
And <code><span style='font-size:10.0pt'>eval(.SD)</span></code> or <code><span style='font-size:10.0pt'>eval(anything(.SD))</span></code> is costly.<o:p></o:p></p><p style='margin:0in;margin-bottom:.0001pt'>You can get around it by listing the columns by yourself and using <code><span style='font-size:10.0pt'>.I</span></code> instead, as follows:<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8;margin-left:1.5pt;margin-right:1.5pt'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>test[test[, .I[length(unique(list(x1,x2,x3))[[1L]]) > 1L], by=id]$V1]<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>#  0.140   0.001   0.142    <o:p></o:p></code></pre></div><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre></div><p>Takes about 0.14 seconds.<o:p></o:p></p><div class=MsoNormal align=center style='text-align:center'><hr size=3 width="100%" noshade style='color:#CCCCCC' align=center></div><p>An even faster way is:<o:p></o:p></p><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8;margin-left:1.5pt;margin-right:1.5pt'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>system.time({<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans = test[test[, .I[.N > 1], by=id]$V1]        # (1)    <o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans = ans[, .N, by=names(ans)]                  # (2)    <o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>ans = ans[ans[, .I[.N > 1L], by=id]$V1]  
       # (3)<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>})<o:p></o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code><o:p> </o:p></code></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>#  0.026   0.000   0.027    <o:p></o:p></code></pre></div><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><o:p> </o:p></pre></div><p>The idea for the second case is:<o:p></o:p></p><p style='margin:0in;margin-bottom:.0001pt'>(1) remove all entries where there’s just 1 row corresponding to that <code><span style='font-size:10.0pt'>id</span></code>.<br>(2) Aggregate this result by all the columns now and get the number of rows in the column <code><span style='font-size:10.0pt'>N</span></code> (we won’t have to use this column though).<br>(3) Now, if we aggregate by <code><span style='font-size:10.0pt'>id</span></code> and if any id has just 1 row, then it’d mean that that <code><span style='font-size:10.0pt'>id</span></code> has had more than 1 rows (step (1) filtering ensures this), but all of them are same and we don’t need them. 
So we just filter for those where .N > 1L.<o:p></o:p></p><p>HTH<o:p></o:p></p><div id="bloop_customfont"><p class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'><o:p> </o:p></span></p></div><div id="bloop_sign_1402709866978106112"><div><p class=MsoNormal><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Arun<o:p></o:p></span></p></div></div><div><p class=MsoNormal><span style='color:black'><br>From: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Reply: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Date: June 14, 2014 at 3:30:55 AM<br>To: <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a> <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br>Subject:  Re: [datatable-help] data.table is asking for help<o:p></o:p></span></p></div><p class=MsoNormal><br><br><o:p></o:p></p><blockquote style='margin-left:0in;margin-top:11.25pt;margin-right:0in;margin-bottom:11.25pt'><div><div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>The performance is what puzzles me; the results are correct so the warnings don’t matter, and not all the variations I’ve tried have warnings.  On the real dataset (~800,000 rows) datatable takes about 1.5 times longer than dataframe + ddply.  
I expected it to be substantially faster.</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'> </span><o:p></o:p></p><div><div style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in'><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><b><span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'>From:</span></b> <span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'>Arunkumar Srinivasan [<a href="mailto:aragorn168b@gmail.com">mailto:aragorn168b@gmail.com</a>]<br><b>Sent:</b> Friday, June 13, 2014 8:57 PM<br><b>To:</b> Ron Hylton; <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br><b>Subject:</b> Re: [datatable-help] data.table is asking for help</span><o:p></o:p></p></div></div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><div id="bloop_customfont"><blockquote style='margin-left:0in;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt'><div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>However there’s another aspect.  While I’m relatively new to R my understanding is that a function argument should be modifiable within the function body without affecting the caller, which perhaps conflicts with the behavior of .SD.</span><o:p></o:p></p></div></div></blockquote><div><div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-family:"Helvetica","sans-serif"'>`data.table` is designed for working with *really large* data sets in mind (> 100 or 200 GB in memory even). 
And therefore, as a design feature, it trades in "referential transparency" for manipulating data objects *as efficient as possible* in terms of both *speed* and *memory usage* (most of the times they go hand-in-hand).</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-family:"Helvetica","sans-serif"'>This is perhaps the biggest design choice one needs to be aware of when working/choosing data.tables. It is possible to modify objects by reference using data.table - All the functions that begin with "set*" modify objects by reference. The only other non "set*" function is `:=` operator.</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-family:"Helvetica","sans-serif"'>HTH</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif";color:black'>Arun</span><o:p></o:p></p></div></div></div></div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif";color:black'><br>From: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Reply: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Date: June 14, 2014 at 2:52:04 AM<br>To: <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a> <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br>Subject:  Re: [datatable-help] data.table is asking for help</span><o:p></o:p></p></div><p class=MsoNormal style='mso-margin-top-alt:auto;margin-bottom:12.0pt'><o:p> </o:p></p><blockquote 
style='margin-left:0in;margin-top:5.0pt;margin-right:0in;margin-bottom:5.0pt'><div><div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>I suspected it was something like this.  As one clarification, there is a setkey(test,id) before any setkey(.SD).   If setkey(test,id) is changed to setkey(test) so all columns are in the original datatable key then the warning goes away.</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'> </span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>However there’s another aspect.  While I’m relatively new to R my understanding is that a function argument should be modifiable within the function body without affecting the caller, which perhaps conflicts with the behavior of .SD.</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'> </span><o:p></o:p></p><div><div style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in'><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><b><span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'>From:</span></b> <span style='font-size:11.0pt;font-family:"Calibri","sans-serif"'>Arunkumar Srinivasan [<a href="mailto:aragorn168b@gmail.com">mailto:aragorn168b@gmail.com</a>]<br><b>Sent:</b> Friday, June 13, 2014 8:23 PM<br><b>To:</b> Ron Hylton; <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br><b>Subject:</b> Re: [datatable-help] data.table is asking for help</span><o:p></o:p></p></div></div><p class=MsoNormal 
style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Nicely reproducible post. Reproducible in v1.9.3 (latest commit) as well.</span><o:p></o:p></p><p style='margin:0in;margin-bottom:.0001pt'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>This is a tricky one. It happens because you’re setting key on</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>which should normally not be allowed. What happens is, when you set key the first time, there’s no key set (here) and therefore key is set on all the columns</span> <code><span style='font-size:10.0pt'>x1</span></code><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>,</span> <code><span style='font-size:10.0pt'>x2</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>and</span> <code><span style='font-size:10.0pt'>x3</span></code><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>.</span><o:p></o:p></p><p style='margin:0in;margin-bottom:.0001pt'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Now, the next group (in the</span> <code><span style='font-size:10.0pt'>by=.</span></code><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>) is passed to your function, it’ll have the</span> <code><span style='font-size:10.0pt'>key</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>already set to</span> <code><span style='font-size:10.0pt'>x1,x2,x3</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>(because</span> <code><span style='font-size:10.0pt'>setkey</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>modifies the object by reference), but</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span 
style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>has obtained <strong><span style='font-family:"Helvetica","sans-serif"'>new</span></strong> data corresponding to <em><span style='font-family:"Helvetica","sans-serif"'>this</span></em> group. And</span> <code><span style='font-size:10.0pt'>data.table</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>sorts this data, knowing that it already has key set.. but if the key is set then the order must be 1:n. But it wouldn’t be, as this data isn’t sorted.</span> <code><span style='font-size:10.0pt'>data.table</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>warns in those scenarios.. and that’s why you get the warning.</span><o:p></o:p></p><p><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>To verify this, you can try:</span><o:p></o:p></p><div style='border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt'><div style='mso-element:para-border-div;border:solid #CCCCCC 1.0pt;padding:3.0pt 6.0pt 3.0pt 6.0pt;background:#F8F8F8'><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>conflictsTable1 <- function(f, address) {</code><o:p></o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>  u <- unique(setkey(f))</code><o:p></o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>  setattr(f, 'sorted', NULL)</code><o:p></o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>  if (nrow(u) == 1) return(NULL)</code><o:p></o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>  u</code><o:p></o:p></pre><pre style='margin:0in;margin-bottom:.0001pt;background:#F8F8F8'><code>}</code><o:p></o:p></pre></div></div><p style='margin:0in;margin-bottom:.0001pt'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Basically, we set the key of</span> <code><span style='font-size:10.0pt'>f</span></code> <span 
style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>(which is equal to</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>as it’s only modified by reference) to</span> <code><span style='font-size:10.0pt'>NULL</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>everytime after.. so that</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>for the new group will not have the key set.</span><o:p></o:p></p><p style='margin:0in;margin-bottom:.0001pt'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>The ideal scenario here, IIUC, is that</span> <code><span style='font-size:10.0pt'>setkey(.SD)</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>or things pointing to</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>should not be possible (locking binding doesn’t seem to affect things done by reference..).</span> <code><span style='font-size:10.0pt'>.SD</span></code> <span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>however should retain the key of the data.table, if a key was set, wherever possible.</span><o:p></o:p></p><div id="bloop_customfont"><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'> </span><o:p></o:p></p></div><div id="bloop_sign_1402704505278157056"><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>Arun</span><o:p></o:p></p></div></div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='color:black'><br>From: Ron Hylton <a 
href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Reply: Ron Hylton <a href="mailto:rhylton@verizon.net">rhylton@verizon.net</a><br>Date: June 14, 2014 at 1:55:53 AM<br>To: <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a> <a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br>Subject:  [datatable-help] data.table is asking for help</span><o:p></o:p></p></div><p class=MsoNormal style='mso-margin-top-alt:auto;margin-bottom:12.0pt'> <o:p></o:p></p><blockquote style='margin-left:0in;margin-top:11.25pt;margin-right:0in;margin-bottom:11.25pt'><div><div><div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>The code below generates the warning:<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;word-break:break-all'><span style='font-size:10.0pt;font-family:"Lucida Console";color:black;background:#E1E2E5'>In setkeyv(x, cols, verbose = verbose) :</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;word-break:break-all'><span style='font-size:10.0pt;font-family:"Lucida Console";color:black;background:#E1E2E5'>  Already keyed by this key but had invalid row order, key rebuilt. If you didn't go under the hood please let datatable-help know so the root cause can be fixed.</span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;word-break:break-all'><span style='font-size:10.0pt;font-family:"Lucida Console";color:black;background:#E1E2E5'> </span><o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>This is my first attempt at using datatable so I probably did something dumb, but maybe that‘s useful for someone.  
The first case is the one that gives the warnings.<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>I’m also surprised at the timings.  I wrote the original algorithm using dataframe & ddply and I expected datatable to be substantially faster; the opposite is true.<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>The algorithm does the following:  Certain columns in the table are keys and others are values in the sense that each row with the same set of keys should have the same set of values.  Find all the key sets for which this is not true and return the keys sets + conflicting value sets.<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>Insight into the performance would be appreciated.<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>Regards,<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>Ron<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>library(data.table)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>library(plyr)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>conflictsTable1 <- function(f) {<o:p></o:p></p><p class=MsoNormal 
style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u <- unique(setkey(f))<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  if (nrow(u) == 1) return(NULL)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>}<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>conflictsTable2 <- function(f) {<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u <- unique(f)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  if (nrow(u) == 1) return(NULL)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>}<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>conflictsFrame <- function(f) {<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u <- unique(f)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  if (nrow(u) == 1) return(NULL)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>  u<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>}<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>N <- 10000<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>test <- 
data.table(id=as.character(10000*sample(1:N,N,replace=TRUE)), x1=rnorm(N), x2=rnorm(N), x3=rnorm(N))<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>setkey(test,id)<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>print(system.time(ut1 <- test[, conflictsTable1(.SD), by=id]))<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>print(system.time(ut2 <- test[, conflictsTable2(.SD), by=id]))<o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'> <o:p></o:p></p><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>print(system.time(uf <- ddply(test, .(id), conflictsFrame)))<o:p></o:p></p></div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'>_______________________________________________<br>datatable-help mailing list<br><a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br><a href="https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help">https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help</a><o:p></o:p></p></div></div></blockquote></div><p class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto'><span style='font-size:10.0pt;font-family:"Helvetica","sans-serif"'>_______________________________________________<br>datatable-help mailing list<br><a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br><a 
href="https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help">https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help</a></span><o:p></o:p></p></div></div></blockquote></div><p class=MsoNormal>_______________________________________________<br>datatable-help mailing list<br><a href="mailto:datatable-help@lists.r-forge.r-project.org">datatable-help@lists.r-forge.r-project.org</a><br><a href="https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help">https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/datatable-help</a><o:p></o:p></p></div></div></blockquote></div></div></blockquote></div></div></blockquote></div></body></html>