[Genabel-commits] r1685 - in pkg/GenABEL-general: . create_data_scripts

Mon Apr 7 22:34:25 CEST 2014

Author: lckarssen
Date: 2014-04-07 22:34:25 +0200 (Mon, 07 Apr 2014)
New Revision: 1685

Added:
   pkg/GenABEL-general/create_data_scripts/
   pkg/GenABEL-general/create_data_scripts/Makefile
   pkg/GenABEL-general/create_data_scripts/README
   pkg/GenABEL-general/create_data_scripts/convert2databel.R
   pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
   pkg/GenABEL-general/create_data_scripts/mach1.mldose
   pkg/GenABEL-general/create_data_scripts/mach1.out.mlinfo
Log:
This commit adds a directory with a script that creates a large data set, e.g. for profiling of ProbABEL or DatABEL.

See the README file for more details.


Added: pkg/GenABEL-general/create_data_scripts/Makefile
===================================================================

--- pkg/GenABEL-general/create_data_scripts/Makefile	                        (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/Makefile	2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,9 @@
+phenofiles = bt.PHE cox.PHE qt.PHE
+machfiles  = mach1.mldose.large mach1.out.large.mlinfo
+fvfiles    = mach1.dose.large.fvd mach1.dose.large.fvi
+invsigfile = invsigma.dat
+
+.PHONY: clean
+
+clean:
+	$(RM) $(phenofiles) $(machfiles) $(fvfiles) $(invsigfile)

Added: pkg/GenABEL-general/create_data_scripts/README
===================================================================
--- pkg/GenABEL-general/create_data_scripts/README	                        (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/README	2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,33 @@
+The enlarge_data_set.sh script uses the provided mach1.mldose and
+mach1.out.mlinfo files to create large data sets. These can then be
+used for testing purposes.
+
+These two provided files contain data for 100 individuals and 5000
+SNPs that are copied m_id and m_SNP times by the script. NOTE: the
+mach1.mldose file is not a proper MaCH output file. It doesn't contain
+the signature -> and MLDOSE.
+
+By default the mutiplication factors m_id and m_SNP are set to 5. The
+can be overridden by the command line options -i and -s,
+respectively.
+
+The other command line options are:
+   -d   to convert the enlarged genotype file to DatABEL/filevector files
+        as well. NOTE: this requires R and the GenABEL and DatABEL
+        packages to be installed.
+   -m   to create an inverse sigma matrix for use with the mmscore
+        option of ProbABEL.
+
+
+The script also creates a set of phenotype files for each of the three
+ProbABEL binaries (palinear, palogist and pacoxph).
+
+The Makefile is only used for cleaning up (i.e. removal of the created
+files). Consequently, it only has one target: clean.
+
+
+TODO:
+- insert random NAs in the phenotype files
+- insert random NAs in the genotype file
+- add covariates to the phenotype files
+- add some error checking when parsing the command line options

Added: pkg/GenABEL-general/create_data_scripts/convert2databel.R
===================================================================
--- pkg/GenABEL-general/create_data_scripts/convert2databel.R	                        (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/convert2databel.R	2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,23 @@
+## Convert dose files to DatABEL format
+##
+## Time-stamp: <2014-03-14 23:14:30 (L.C. Karssen)>
+##
+## This script requires three command line arguments in the following
+## order:
+## - the name of the info file
+## - the name of the dosage file (MaCH format)
+## - the name of the output file
+##
+## Run the script like this:
+##    R --vanilla -q -f convert2databel.R --args file.info file.dose file.out
+
+library(DatABEL)
+library(GenABEL)
+
+args <- commandArgs(TRUE)
+infofile <- args[1]
+dosefile <- args[2]
+outfile  <- args[3]
+
+mach2databel(dosefile, infofile, outfile, isprobfile = FALSE,
+             dataOutType = "FLOAT")

Added: pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
===================================================================
--- pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh	                        (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh	2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,191 @@
+#!/bin/bash
+#
+# Time-stamp: <2014-04-07 22:19:09 (L.C. Karssen)>
+#
+# This script takes the GenABEL tutorial example data set for imputed
+# data and enlarges it to make it a bit more like a real analysis.
+
+
+## Parse command line options
+SNPmultfact=5
+IDmultfact=5
+makeDA=false
+invsigma=false
+while [ $# -gt 0 ]; do
+    case $1 in
+        -s | --snpfact )
+            SNPmultfact=$2
+            shift
+            ;;
+        -i | --idfactor )
+            IDmultfact=$2
+            shift
+            ;;
+        -d | --databel )
+            makeDA=true
+            ;;
+        -m | --mmscore )
+            invsigma=true
+            ;;
+        -* )
+            echo "$0: invalid option $1" >&2
+            exit 1
+            ;;
+        *)
+            break
+            ;;
+    esac
+    shift
+done
+
+base=mach1.mldose
+dosefvi=$base.fvi
+dosefvd=$base.fvd
+dosetxt=$base
+infofile=mach1.out.mlinfo
+fvbase=mach1.dose.large
+
+
+largedose=$dosetxt.snps
+largerdose=$dosetxt.large
+if [ -f $largedose ]; then
+    rm $largedose
+fi
+if [ -f $largerdose ]; then
+    rm $largerdose
+fi
+
+# Calculate the new number of IDs and SNPs
+nsnps=$(echo "5000 * $SNPmultfact" | bc)
+nids=$(echo "100 * $IDmultfact" | bc)
+
+# Add SNPs by copying the existing columns
+echo "Increasing the number of SNPS with a factor ${SNPmultfact} to ${nsnps}..."
+gawk -v mf=$SNPmultfact '{
+      line="";
+      printf("%s%s", $1, OFS);
+      $1="";
+      for (i=1; i<=mf; i++) {line = line $0};
+      print line}' \
+    $dosetxt | sed 's/  / /g' >> $largedose
+
+
+# Adding the SNPs to the info file
+echo "Extending info file..."
+largeinfo="$(basename $infofile .mlinfo).large.mlinfo"
+if [ -f $largeinfo ]; then
+    rm $largeinfo
+fi
+
+## Wrap the whole SNP name creation in a while loop so it can be rerun
+## if the random name generation happened to result in identical
+## names.
+uniquesnps=1
+while [ $uniquesnps -lt $nsnps ]; do
+    for i in $(seq 1 $SNPmultfact); do
+        gawk -v i=$i 'BEGIN{srand()}
+                  NR==1 {print};
+                  NR!=1 {rnd = int(i * 10000 * rand());
+                         rs = $1 rnd;
+                         $1 = rs;
+                         print}' \
+            $infofile >> $largeinfo
+    done
+
+    # Remove header lines except the first
+    sed -i '1!{/^SNP/d;}' $largeinfo
+
+    uniquesnps=$(cut -d " " -f 1 $largeinfo | sort -u | wc -l)
+done
+
+
+# Add individuals by copying the existing rows
+# Make it a 'real' MaCH output file by re-inserting the 1->id MLDOSE
+# columns
+echo "Increasing the sample size with a factor ${IDmultfact} to ${nids}..."
+for i in $(seq 1 $IDmultfact); do
+    gawk -v i=$i '{id = (i-1)*100 + $1;
+                   $1 = id "->id" sprintf("%04d", id) " MLDOSE";
+                   print $0}' \
+        $largedose >> $largerdose
+done
+
+echo "Creating a matching phenotype file with a random QT..."
+gawk -F"-" 'BEGIN{srand(); print "id qt"}
+            {print "id" sprintf("%04d", $1), rand()}' \
+    $largerdose > qt.PHE
+
+echo "Creating a matching phenotype file with a random BT..."
+gawk -F"-" 'BEGIN{srand(); print "id bt"}
+            {print "id" sprintf("%04d", $1), sprintf("%.0f", rand())}' \
+    $largerdose > bt.PHE
+
+echo "Creating a matching phenotype file with random Cox PH data..."
+gawk -F"-" 'BEGIN{srand(); print "id fupt status cov"}
+            {print "id" sprintf("%04d", $1),
+                  10 * rand(),
+                  sprintf("%.0f", rand()),
+                  rand()}' \
+    $largerdose > cox.PHE
+
+
+
+echo -n "Dimensions of the final data set: "
+gawk 'BEGIN{
+          ncol = 0
+          warning = ""
+      }
+
+      {
+          if (ncol !=0 && ncol != NF)
+          {
+              warning = warning "Row " NR """ nr. of columns (" NF ") not equal to previous line (" ncol ")\n"
+          }
+          ncol = NF
+      }
+
+      END{
+          print NR, ncol
+          if (warning != "")
+          {
+              print warning > "/dev/stderr"
+              exit 1
+          }
+      }' $largerdose
+
+
+## Make a DatABEL/filevector file
+if [ $makeDA == true ]; then
+    echo
+    echo "Converting to DatABEL/filevector format using R..."
+    R --vanilla -q --slave \
+        -f convert2databel.R \
+        --args $largeinfo $largerdose $fvbase 1> /dev/null
+fi
+
+## Create an inverse sigma file
+if [ $invsigma == true ]; then
+    echo "Creating inverse-sigma file ${invsigmafile}..."
+    invsigmafile=invsigma.dat
+    gawk -v nids=$nids \
+        'BEGIN {
+          srand();
+          for(i=1; i<=nids; i++) {
+             for(j=1; j<=i; j++) {
+                is[i, j] = is[j, i] = rand() * 1e-5
+             }
+          }
+         }
+
+        NR!=1 {
+          $2="";
+          for(i=1; i<=nids; i++) {
+            $2 = $2 OFS is[NR-1, i]
+          }
+          print $0
+        }' qt.PHE > $invsigmafile
+fi
+
+
+## Clean up intermediate files
+rm $largedose


Property changes on: pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
___________________________________________________________________
Added: svn:executable
   + *

Added: pkg/GenABEL-general/create_data_scripts/mach1.mldose
===================================================================
--- pkg/GenABEL-general/create_data_scripts/mach1.mldose	                        (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/mach1.mldose	2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,100 @@
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/genabel -r 1685