[Genabel-commits] r1685 - in pkg/GenABEL-general: . create_data_scripts
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Mon Apr 7 22:34:25 CEST 2014
Author: lckarssen
Date: 2014-04-07 22:34:25 +0200 (Mon, 07 Apr 2014)
New Revision: 1685
Added:
pkg/GenABEL-general/create_data_scripts/
pkg/GenABEL-general/create_data_scripts/Makefile
pkg/GenABEL-general/create_data_scripts/README
pkg/GenABEL-general/create_data_scripts/convert2databel.R
pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
pkg/GenABEL-general/create_data_scripts/mach1.mldose
pkg/GenABEL-general/create_data_scripts/mach1.out.mlinfo
Log:
This commit adds a directory with a script that creates a large data set, e.g. for profiling of ProbABEL or DatABEL.
See the README file for more details.
Added: pkg/GenABEL-general/create_data_scripts/Makefile
===================================================================
--- pkg/GenABEL-general/create_data_scripts/Makefile (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/Makefile 2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,9 @@
+phenofiles = bt.PHE cox.PHE qt.PHE
+machfiles = mach1.mldose.large mach1.out.large.mlinfo
+fvfiles = mach1.dose.large.fvd mach1.dose.large.fvi
+invsigfile = invsigma.dat
+
+.PHONY: clean
+
+clean:
+ $(RM) $(phenofiles) $(machfiles) $(fvfiles) $(invsigfile)
Added: pkg/GenABEL-general/create_data_scripts/README
===================================================================
--- pkg/GenABEL-general/create_data_scripts/README (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/README 2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,33 @@
+The enlarge_data_set.sh script uses the provided mach1.mldose and
+mach1.out.mlinfo files to create large data sets. These can then be
+used for testing purposes.
+
+These two provided files contain data for 100 individuals and 5000
+SNPs that are copied m_id and m_SNP times by the script. NOTE: the
+mach1.mldose file is not a proper MaCH output file. It doesn't contain
+the signature -> and MLDOSE.
+
+By default the mutiplication factors m_id and m_SNP are set to 5. The
+can be overridden by the command line options -i and -s,
+respectively.
+
+The other command line options are:
+ -d to convert the enlarged genotype file to DatABEL/filevector files
+ as well. NOTE: this requires R and the GenABEL and DatABEL
+ packages to be installed.
+ -m to create an inverse sigma matrix for use with the mmscore
+ option of ProbABEL.
+
+
+The script also creates a set of phenotype files for each of the three
+ProbABEL binaries (palinear, palogist and pacoxph).
+
+The Makefile is only used for cleaning up (i.e. removal of the created
+files). Consequently, it only has one target: clean.
+
+
+TODO:
+- insert random NAs in the phenotype files
+- insert random NAs in the genotype file
+- add covariates to the phenotype files
+- add some error checking when parsing the command line options
Added: pkg/GenABEL-general/create_data_scripts/convert2databel.R
===================================================================
--- pkg/GenABEL-general/create_data_scripts/convert2databel.R (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/convert2databel.R 2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,23 @@
+## Convert dose files to DatABEL format
+##
+## Time-stamp: <2014-03-14 23:14:30 (L.C. Karssen)>
+##
+## This script requires three command line arguments in the following
+## order:
+## - the name of the info file
+## - the name of the dosage file (MaCH format)
+## - the name of the output file
+##
+## Run the script like this:
+## R --vanilla -q -f convert2databel.R --args file.info file.dose file.out
+
+library(DatABEL)
+library(GenABEL)
+
+args <- commandArgs(TRUE)
+infofile <- args[1]
+dosefile <- args[2]
+outfile <- args[3]
+
+mach2databel(dosefile, infofile, outfile, isprobfile = FALSE,
+ dataOutType = "FLOAT")
Added: pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
===================================================================
--- pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh 2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,191 @@
+#!/bin/bash
+#
+# Time-stamp: <2014-04-07 22:19:09 (L.C. Karssen)>
+#
+# This script takes the GenABEL tutorial example data set for imputed
+# data and enlarges it to make it a bit more like a real analysis.
+
+
+## Parse command line options
+SNPmultfact=5
+IDmultfact=5
+makeDA=false
+invsigma=false
+while [ $# -gt 0 ]; do
+ case $1 in
+ -s | --snpfact )
+ SNPmultfact=$2
+ shift
+ ;;
+ -i | --idfactor )
+ IDmultfact=$2
+ shift
+ ;;
+ -d | --databel )
+ makeDA=true
+ ;;
+ -m | --mmscore )
+ invsigma=true
+ ;;
+ -* )
+ echo "$0: invalid option $1" >&2
+ exit 1
+ ;;
+ *)
+ break
+ ;;
+ esac
+ shift
+done
+
+base=mach1.mldose
+dosefvi=$base.fvi
+dosefvd=$base.fvd
+dosetxt=$base
+infofile=mach1.out.mlinfo
+fvbase=mach1.dose.large
+
+
+largedose=$dosetxt.snps
+largerdose=$dosetxt.large
+if [ -f $largedose ]; then
+ rm $largedose
+fi
+if [ -f $largerdose ]; then
+ rm $largerdose
+fi
+
+# Calculate the new number of IDs and SNPs
+nsnps=$(echo "5000 * $SNPmultfact" | bc)
+nids=$(echo "100 * $IDmultfact" | bc)
+
+# Add SNPs by copying the existing columns
+echo "Increasing the number of SNPS with a factor ${SNPmultfact} to ${nsnps}..."
+gawk -v mf=$SNPmultfact '{
+ line="";
+ printf("%s%s", $1, OFS);
+ $1="";
+ for (i=1; i<=mf; i++) {line = line $0};
+ print line}' \
+ $dosetxt | sed 's/ / /g' >> $largedose
+
+
+# Adding the SNPs to the info file
+echo "Extending info file..."
+largeinfo="$(basename $infofile .mlinfo).large.mlinfo"
+if [ -f $largeinfo ]; then
+ rm $largeinfo
+fi
+
+## Wrap the whole SNP name creation in a while loop so it can be rerun
+## if the random name generation happened to result in identical
+## names.
+uniquesnps=1
+while [ $uniquesnps -lt $nsnps ]; do
+ for i in $(seq 1 $SNPmultfact); do
+ gawk -v i=$i 'BEGIN{srand()}
+ NR==1 {print};
+ NR!=1 {rnd = int(i * 10000 * rand());
+ rs = $1 rnd;
+ $1 = rs;
+ print}' \
+ $infofile >> $largeinfo
+ done
+
+ # Remove header lines except the first
+ sed -i '1!{/^SNP/d;}' $largeinfo
+
+ uniquesnps=$(cut -d " " -f 1 $largeinfo | sort -u | wc -l)
+done
+
+
+# Add individuals by copying the existing rows
+# Make it a 'real' MaCH output file by re-inserting the 1->id MLDOSE
+# columns
+echo "Increasing the sample size with a factor ${IDmultfact} to ${nids}..."
+for i in $(seq 1 $IDmultfact); do
+ gawk -v i=$i '{id = (i-1)*100 + $1;
+ $1 = id "->id" sprintf("%04d", id) " MLDOSE";
+ print $0}' \
+ $largedose >> $largerdose
+done
+
+echo "Creating a matching phenotype file with a random QT..."
+gawk -F"-" 'BEGIN{srand(); print "id qt"}
+ {print "id" sprintf("%04d", $1), rand()}' \
+ $largerdose > qt.PHE
+
+echo "Creating a matching phenotype file with a random BT..."
+gawk -F"-" 'BEGIN{srand(); print "id bt"}
+ {print "id" sprintf("%04d", $1), sprintf("%.0f", rand())}' \
+ $largerdose > bt.PHE
+
+echo "Creating a matching phenotype file with random Cox PH data..."
+gawk -F"-" 'BEGIN{srand(); print "id fupt status cov"}
+ {print "id" sprintf("%04d", $1),
+ 10 * rand(),
+ sprintf("%.0f", rand()),
+ rand()}' \
+ $largerdose > cox.PHE
+
+
+
+echo -n "Dimensions of the final data set: "
+gawk 'BEGIN{
+ ncol = 0
+ warning = ""
+ }
+
+ {
+ if (ncol !=0 && ncol != NF)
+ {
+ warning = warning "Row " NR """ nr. of columns (" NF ") not equal to previous line (" ncol ")\n"
+ }
+ ncol = NF
+ }
+
+ END{
+ print NR, ncol
+ if (warning != "")
+ {
+ print warning > "/dev/stderr"
+ exit 1
+ }
+ }' $largerdose
+
+
+## Make a DatABEL/filevector file
+if [ $makeDA == true ]; then
+ echo
+ echo "Converting to DatABEL/filevector format using R..."
+ R --vanilla -q --slave \
+ -f convert2databel.R \
+ --args $largeinfo $largerdose $fvbase 1> /dev/null
+fi
+
+## Create an inverse sigma file
+if [ $invsigma == true ]; then
+ echo "Creating inverse-sigma file ${invsigmafile}..."
+ invsigmafile=invsigma.dat
+ gawk -v nids=$nids \
+ 'BEGIN {
+ srand();
+ for(i=1; i<=nids; i++) {
+ for(j=1; j<=i; j++) {
+ is[i, j] = is[j, i] = rand() * 1e-5
+ }
+ }
+ }
+
+ NR!=1 {
+ $2="";
+ for(i=1; i<=nids; i++) {
+ $2 = $2 OFS is[NR-1, i]
+ }
+ print $0
+ }' qt.PHE > $invsigmafile
+fi
+
+
+## Clean up intermediate files
+rm $largedose
Property changes on: pkg/GenABEL-general/create_data_scripts/enlarge_data_set.sh
___________________________________________________________________
Added: svn:executable
+ *
Added: pkg/GenABEL-general/create_data_scripts/mach1.mldose
===================================================================
--- pkg/GenABEL-general/create_data_scripts/mach1.mldose (rev 0)
+++ pkg/GenABEL-general/create_data_scripts/mach1.mldose 2014-04-07 20:34:25 UTC (rev 1685)
@@ -0,0 +1,100 @@
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/genabel -r 1685
More information about the Genabel-commits
mailing list