[Fingerprint-commits] r2 - in pkg: . R man src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Fri Oct 24 23:02:21 CEST 2008


Author: rajarshi
Date: 2008-10-24 23:02:21 +0200 (Fri, 24 Oct 2008)
New Revision: 2

Added:
   pkg/DESCRIPTION
   pkg/INDEX
   pkg/NAMESPACE
   pkg/R/bitspec.R
   pkg/R/fingerprint.R
   pkg/R/matrix.R
   pkg/R/misc.R
   pkg/R/ops.R
   pkg/R/read.R
   pkg/R/zzz.R
   pkg/README
   pkg/man/bitspec.Rd
   pkg/man/distance.Rd
   pkg/man/facmat.Rd
   pkg/man/fingerprint.Rd
   pkg/man/fold.Rd
   pkg/man/fplogical.Rd
   pkg/man/length.Rd
   pkg/man/linefunc.Rd
   pkg/man/mat.Rd
   pkg/man/read.Rd
   pkg/man/rndfp.Rd
   pkg/man/show.Rd
   pkg/man/sim.Rd
   pkg/man/string.Rd
   pkg/man/vec.Rd
   pkg/src/
   pkg/src/fpdistance.c
Log:
Added all the sources from my local repo. Unfortunately, lost the history

Added: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION	                        (rev 0)
+++ pkg/DESCRIPTION	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,19 @@
+Package: fingerprint
+Version: 3.1
+Date: 2008-10-17
+Title: Functions to operate on binary fingerprint data
+Author: Rajarshi Guha <rguha at indiana.edu>
+Maintainer: Rajarshi Guha <rguha at indiana.edu>
+Description: This package contains functions to manipulate binary fingerprints
+ of arbitrary length. A fingerprint is represented by an object of S4 class 'fingerprint'
+ which is internally represented as a vector of integers, such
+ that each element represents the position in the fingerprint that is set to 1.
+ The bitwise logical functions in R are overridden so that they can be used directly
+ with 'fingerprint' objects. A number of distance metrics are also
+ available (many contributed by Michael Fadock). Fingerprints 
+ can be converted to Euclidean vectors (i.e., points on the unit hypersphere) and
+ can also be folded using OR.  Arbitrary fingerprint formats can be handled via line
+ handlers. Currently handlers are provided for CDK, MOE and BCI fingerprint data.
+License: GPL
+Depends: methods
+LazyLoad: yes

Added: pkg/INDEX
===================================================================
--- pkg/INDEX	                        (rev 0)
+++ pkg/INDEX	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,19 @@
+!                       Logical Operators for Fingerprints
+as.character            Generates a String Representation of a
+                        Fingerprint
+cdk.lf                  Functions to parse lines from fingerprint files
+distance                Calculates the Similarity or Dissimilarity
+                        Between Two Fingerprints
+euc.vector              Euclidean Representation of Binary Fingerprints
+fingerprint-class       Class "fingerprint"
+fold                    Fold a fingerprint
+fp.factor.matrix        Converts a List of Fingerprints to a data.frame
+                        of Factors
+fp.read                 Functions to Read Fingerprints From Files
+fp.sim.matrix           Calculates a Similarity Matrix for a Set of
+                        Fingerprints
+fp.to.matrix            Converts a List of Fingerprints to a Matrix
+length                  Fingerprint Bit Length
+random.fingerprint      Generate Randomized Fingerprints
+show,fingerprint-method
+                        String Representation of a Fingerprint

Added: pkg/NAMESPACE
===================================================================
--- pkg/NAMESPACE	                        (rev 0)
+++ pkg/NAMESPACE	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,5 @@
+# Import the whole 'methods' package: importFrom() needs explicit symbol
+# names and imports nothing when given only the package name
+import("methods")
+exportClasses("fingerprint")
+exportMethods("fold", "euc.vector", "distance", "random.fingerprint", "as.character", "length", "show")
+export("fp.sim.matrix", "fp.to.matrix", "fp.factor.matrix", "fp.read.to.matrix", "fp.read", "moe.lf", "bci.lf", "cdk.lf", "bit.spectrum")
+useDynLib(fingerprint)

Added: pkg/R/bitspec.R
===================================================================
--- pkg/R/bitspec.R	                        (rev 0)
+++ pkg/R/bitspec.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,12 @@
+## Compute the bit spectrum of a collection of fingerprints: for every bit
+## position, the fraction of the supplied fingerprints that have it set.
+##
+## fplist: a non-empty list of 'fingerprint' objects. length() of the first
+##         element fixes the length of the result.
+## Returns a numeric vector of per-position frequencies.
+bit.spectrum <- function(fplist) {
+  ## is.list()/inherits() are used instead of comparing class() with '!=',
+  ## which gives a multi-element condition for objects with several classes
+  if (!is.list(fplist) || length(fplist) == 0)
+    stop("Must provide a list of fingerprint objects")
+  if (!all(vapply(fplist, inherits, logical(1), what='fingerprint')))
+    stop("Must provide a list of fingerprint objects")
+  nbit <- length(fplist[[1]])
+  spec <- numeric(nbit)
+  for (fp in fplist) {
+    bits <- fp@bits
+    spec[bits] <- spec[bits]+1
+  }
+  spec / length(fplist)
+}

Added: pkg/R/fingerprint.R
===================================================================
--- pkg/R/fingerprint.R	                        (rev 0)
+++ pkg/R/fingerprint.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,69 @@
+## S4 class for a binary fingerprint stored sparsely.
+##   bits     - positions of the bits that are on
+##   nbit     - length of the full bit string
+##   folded   - whether the fingerprint has been folded
+##   provider - source of the fingerprint
+##   name     - name associated with the fingerprint ("" by default)
+setClass("fingerprint",
+         representation(bits="numeric",
+                        nbit="numeric",
+                        folded="logical",
+                        provider="character",
+                        name="character"),
+         validity=function(object) {
+           ## no bit position may lie beyond the declared bit length
+           if (any(object@bits > object@nbit))
+             return("Bit positions were greater than the specified bit length")
+           TRUE
+         },
+         ## numeric(0) rather than c() (NULL) so the prototype matches the
+         ## declared class of the 'bits' slot
+         prototype(bits=numeric(0),
+                   nbit=0,
+                   folded=FALSE,
+                   provider="",
+                   name=""))
+
+#setGeneric("show", function(object) standardGeneric("show"))
+## Print a summary of a fingerprint: name, bit length, folded flag,
+## provider and the sorted positions of the bits that are on.
+setMethod("show", "fingerprint",
+          function(object) {
+            cat("Fingerprint object\n")
+            cat(" name = ", object@name, "\n")
+            cat(" length = ", object@nbit, "\n")
+            cat(" folded = ", object@folded, "\n")
+            cat(" source = ", object@provider, "\n")
+            cat(" bits on = ", paste(sort(object@bits), collapse=' '), "\n")
+          })
+
+
+## Render a fingerprint as a single string of '0' and '1' characters, one
+## character per bit position.
+setMethod('as.character', "fingerprint",
+          function(x) {
+            ## dense 0/1 vector with the 'on' positions set to 1
+            s <- numeric(x@nbit)
+            s[x@bits] <- 1
+            paste(s,sep='',collapse='')
+          })
+
+## Length of a fingerprint is the length of the whole bit string (the
+## 'nbit' slot), not the number of bits that are on.
+setMethod("length", "fingerprint",
+          function(x) {
+            x@nbit
+          })
+
+## Turn a call object into a named list of its arguments, each value being
+## the deparsed argument text with all spaces removed.
+##
+## obj: a 'call' object (e.g. the result of match.call()).
+## Returns a list named by argument name. An argument supplied without a
+## name is stored as NA under a name equal to its own text.
+##
+## The arguments are taken from the call object itself rather than by
+## splitting the deparsed text on ',' and '=', which mis-parses values such
+## as c(1, 2) or strings containing those characters.
+parseCall <- function (obj) 
+{
+    if (!is.call(obj)) {
+        stop("Must supply a 'call' object")
+    }
+    args <- as.list(obj)[-1]
+    if (length(args) == 0)
+        return(list())
+    flatten <- function(x) gsub(" ", "", paste(deparse(x), collapse = ""))
+    txt <- vapply(args, flatten, character(1), USE.NAMES = FALSE)
+    nms <- names(args)
+    if (is.null(nms))
+        nms <- rep("", length(args))
+    unnamed <- nms == ""
+    vals <- as.list(txt)
+    vals[unnamed] <- list(NA_character_)
+    nms[unnamed] <- txt[unnamed]
+    names(vals) <- nms
+    vals
+}

Added: pkg/R/matrix.R
===================================================================
--- pkg/R/matrix.R	                        (rev 0)
+++ pkg/R/matrix.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+## Build the symmetric pairwise matrix for a list of fingerprints by calling
+## distance() on every pair; the diagonal is set to 1.
+##
+## fplist: list of 'fingerprint' objects.
+## method: metric name passed on to distance().
+## Returns a length(fplist) x length(fplist) numeric matrix.
+fp.sim.matrix <- function(fplist, method='tanimoto') {
+  n <- length(fplist)
+  sim <- matrix(0, nrow=n, ncol=n)
+  ## seq_len() keeps the loop empty for lists of length 0 or 1, where
+  ## 1:(n-1) would count downwards
+  for (i in seq_len(max(n-1, 0))) {
+    rest <- (i+1):n
+    v <- unlist(lapply( fplist[rest], distance, fp2=fplist[[i]], method=method))
+    sim[i,rest] <- v
+    sim[rest,i] <- v
+  }
+  diag(sim) <- 1.0
+  sim
+}
+
+## Takes the fingerprints, P bits, for a set of N molecules supplied as
+## a list structure and creates an N x P matrix of 0/1 values.
+## P is taken from the 'nbit' slot of the first fingerprint in the list.
+fp.to.matrix <- function( fplist ) {
+  size <- fplist[[1]]@nbit
+  m <- matrix(0, nrow=length(fplist), ncol=size)
+  for (i in seq_along(fplist)) {
+    ## row i gets a 1 in every column whose bit is on
+    m[i,fplist[[i]]@bits] <- 1
+  }
+  m
+}
+
+## Convert a list of fingerprints into a data.frame with one row per
+## fingerprint and one factor column (levels 0 and 1) per bit position.
+fp.factor.matrix <- function( fplist ) {
+  m <- data.frame(fp.to.matrix(fplist))
+  ## m[] <- keeps the data.frame shape while replacing every column
+  m[] <- lapply(m, factor, levels=0:1)
+  m
+}

Added: pkg/R/misc.R
===================================================================
--- pkg/R/misc.R	                        (rev 0)
+++ pkg/R/misc.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,188 @@
+## Fold a fingerprint: split the bit string into two halves of equal length
+## and OR them together. The result has half the original length, provider
+## "R" and its 'folded' slot set to TRUE. Stops if the bit length is odd.
+setGeneric("fold", function(fp) standardGeneric("fold"))
+setMethod("fold", "fingerprint",
+          function(fp) {
+            size <- fp@nbit
+            if (size %% 2 != 0) {
+              stop('Need to supply a fingerprint of even numbered length')
+            }
+            ## dense logical form so that each half can be sliced out
+            bfp <- rep(FALSE, size)
+            bfp[fp@bits] <- TRUE
+
+            subfplen <- size/2
+
+            ## 'on' positions within each half, numbered from 1 in that half
+            b1 <- which(bfp[1:subfplen])
+            b2 <- which(bfp[(subfplen+1):size])
+
+            subfp1 <- new("fingerprint",
+                          nbit=subfplen,
+                          bits=b1,
+                          provider="R")
+
+            subfp2 <- new("fingerprint",
+                          nbit=subfplen,
+                          bits=b2,
+                          provider="R")
+            foldedfp <- subfp1 | subfp2
+            foldedfp@folded <- TRUE
+            foldedfp
+          })
+
+## Represent a fingerprint as a numeric vector with one coordinate per bit:
+## 0 for bits that are off and 1/sqrt(bit length) for bits that are on.
+setGeneric("euc.vector", function(fp) standardGeneric("euc.vector"))
+setMethod("euc.vector", "fingerprint",
+          function(fp) {
+            coord <- rep(0,length(fp))
+            coord[fp@bits] <- 1.0 / sqrt(length(fp))
+            coord
+          })
+
+
+## Generic for the similarity or dissimilarity between two fingerprints.
+setGeneric("distance", function(fp1,fp2,method) standardGeneric("distance"))
+
+## With no method supplied the Tanimoto coefficient is used.
+setMethod("distance", c("fingerprint", "fingerprint", "missing"),
+          function(fp1,fp2) {
+            distance(fp1,fp2,"tanimoto")
+          })
+
+## Compute the requested metric between two fingerprints of equal bit length.
+##
+## fp1, fp2: 'fingerprint' objects; stops if their lengths differ.
+## method:   metric name, resolved with match.arg() so partial names work.
+## Returns a single numeric value.
+##
+## 'tanimoto' and 'euclidean' are delegated to the compiled routine
+## "fpdistance". Every other metric is derived from the four counts
+##   a = bits on in fp1 only      b = bits on in fp2 only
+##   c = bits on in both          d = bits off in both
+setMethod("distance", c("fingerprint", "fingerprint", "character"),
+          function(fp1,fp2, method=c('tanimoto', 'euclidean', 'mt',
+                              'simple', 'jaccard', 'dice',
+                              'russelrao', 'rodgerstanimoto','cosine',
+                              'achiai', 'carbo', 'baroniurbanibuser',
+                              'kulczynski2',
+
+                              'hamming', 'meanHamming', 'soergel',
+                              'patternDifference', 'variance', 'size', 'shape',
+
+                              'hamann', 'yule', 'pearson', 'dispersion',
+                              'mcconnaughey', 'stiles',
+
+                              'simpson', 'petke',
+                              'stanimoto', 'seuclidean'
+                              )) {
+
+            if ( length(fp1) != length(fp2))
+              stop("Fingerprints must of the same bit length")
+
+            method <- match.arg(method)
+            n <- length(fp1)
+
+            if (method == 'tanimoto' || method == 'euclidean') {
+              ## dense 0/1 vectors for the compiled routine
+              f1 <- numeric(n)
+              f2 <- numeric(n)
+              f1[fp1@bits] <- 1
+              f2[fp2@bits] <- 1
+              ## metric selector passed to the C routine:
+              ## 1 for tanimoto, 2 for euclidean
+              code <- if (method == 'tanimoto') 1 else 2
+              ## both fingerprints are passed; comparing f1 with itself
+              ## would always give a euclidean distance of 0
+              ret <-  .C("fpdistance", as.double(f1), as.double(f2),
+                         as.integer(n), as.integer(code),
+                         as.double(0.0),
+                         PACKAGE="fingerprint")
+              return (ret[[5]])
+            }
+
+            ## in A & B
+            tmp <- fp1 & fp2
+            c <- length(tmp@bits)
+
+            ## in A not in B
+            tmp <- (fp1 | fp2) & !fp2
+            a <- length(tmp@bits)
+
+            ## in B not in A
+            tmp <- (fp1 | fp2) & !fp1
+            b <- length(tmp@bits)
+
+            ## not in A, not in B
+            tmp <- !(fp1 | fp2)
+            d <- length(tmp@bits)
+
+            switch(method,
+              ## Similarity
+              stanimoto = c / (a+b+c),
+              seuclidean = sqrt((d+c) / (a+b+c+d)),
+              dice = c / (.5*a + .5*b + c),
+              mt = {
+                t1 <- c/(n-d)
+                t0 <- d/(n-c)
+                phat <- ((n-d) + c)/(2*n)
+                (2-phat)*t1/3 + (1+phat)*t0/3
+              },
+              simple = (c+d)/n,
+              jaccard = c/(a+b+c),
+              russelrao = c/n,
+              rodgerstanimoto = (c+d)/(2*a+2*b+c+d),
+              ## these three names share one formula
+              cosine = ,
+              achiai = ,
+              carbo = c/sqrt((a+c)*(b+c)),
+              baroniurbanibuser = (sqrt(c*d)+c)/(sqrt(c*d)+a+b+c),
+              kulczynski2 = .5*(c/(a+c)+c/(b+c)),
+
+              ## Dissimilarity
+              hamming = a+b,
+              meanHamming = (a+b)/(a+b+c+d),
+              soergel = (a+b)/(a+b+c),
+              patternDifference = (a*b)/(a+b+c+d)^2,
+              variance = (a+b)/(4*n),
+              size = (a-b)^2/n^2,
+              shape = (a+b)/n-((a-b)/(n))^2,
+
+              ## Composite
+              hamann = (c+d-a-b)/(a+b+c+d),
+              yule = (c*d-a*b)/(c*d+a*b),
+              pearson = (c*d-a*b)/sqrt((a+c)*(b+c)*(a+d)*(b+d)),
+              dispersion = (c*d-a*b)/n^2,
+              ## spelled as in the argument list so match.arg() can reach it
+              mcconnaughey = (c^2-a*b)/((a+c)*(b+c)),
+              stiles = log10(n*(abs(c*d-a*b)-n/2)^2/((a+c)*(b+c)*(a+d)*(b+d))),
+
+              ## Asymmetric
+              simpson = c/min((a+c),(b+c)),
+              petke = c/max((a+c),(b+c)))
+          })
+
+## Create a fingerprint of length 'nbit' in which 'on' positions, drawn at
+## random without replacement, are set. Both arguments must be positive.
+## The result has provider "R" and is not folded.
+setGeneric("random.fingerprint",
+           function(nbit, on) standardGeneric("random.fingerprint"))
+setMethod("random.fingerprint", c("numeric", "numeric"),
+          function(nbit, on) {
+            if (nbit <= 0) {
+              stop("Bit length must be positive integer")
+            }
+            if (on <= 0) {
+              stop("Number of bits to be set to 1 must be positive integer")
+            }
+            positions <- sample(1:nbit, size=on)
+            new("fingerprint",
+                bits=positions,
+                nbit=nbit,
+                folded=FALSE,
+                provider="R")
+          })

Added: pkg/R/ops.R
===================================================================
--- pkg/R/ops.R	                        (rev 0)
+++ pkg/R/ops.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,54 @@
+## Logical AND of two fingerprints of equal bit length: the result has the
+## bits that are on in both operands, and provider "R".
+setMethod("&", c("fingerprint", "fingerprint"),
+          function(e1, e2) {
+            if (e1@nbit != e2@nbit)
+              stop("fp1 & fp2 must of the same bit length")
+
+            andbits <- intersect(e1@bits, e2@bits)
+            new("fingerprint",
+                bits=andbits,
+                nbit=e1@nbit,
+                provider="R")
+          })
+
+## Logical OR of two fingerprints of equal bit length: the result has the
+## bits that are on in either operand, and provider "R".
+setMethod("|", c("fingerprint", "fingerprint"),
+          function(e1, e2) {
+            if (e1@nbit != e2@nbit)
+              stop("fp1 & fp2 must of the same bit length")
+
+            orbits <- union(e1@bits, e2@bits)
+            new("fingerprint",
+                bits=orbits,
+                nbit=e1@nbit,
+                provider="R")
+          })
+
+## Logical NOT of a fingerprint: the result has every position from 1 to
+## nbit that is not on in the operand, and provider "R".
+setMethod("!", c("fingerprint"),
+          function(x) {
+            ## seq_len() gives an empty range for nbit = 0, where 1:nbit would
+            ## give c(1, 0). setdiff() replaces negative indexing, which
+            ## selects nothing when the only listed position is 0.
+            b <- setdiff(seq_len(x@nbit), x@bits)
+            new("fingerprint",
+                bits=b,
+                nbit=x@nbit,
+                provider="R")
+          })
+
+## Logical XOR of two fingerprints of equal bit length: the result has the
+## bits that are on in exactly one of the operands, and provider "R".
+setMethod("xor", c("fingerprint", "fingerprint"),
+          function(x,y) {
+            if (x@nbit != y@nbit)
+              stop("e1 & e2 must of the same bit length")
+
+            ## expand both operands to dense logical vectors and combine them
+            tmp1 <- rep(FALSE, x@nbit)
+            tmp2 <- rep(FALSE, y@nbit)
+            tmp1[x@bits] <- TRUE
+            tmp2[y@bits] <- TRUE
+            xorbits <- which(xor(tmp1,tmp2))
+
+            new("fingerprint",
+                bits=xorbits,
+                nbit=x@nbit,
+                provider="R")
+          })
+

Added: pkg/R/read.R
===================================================================
--- pkg/R/read.R	                        (rev 0)
+++ pkg/R/read.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,53 @@
+## Line handler for CDK output: extracts the comma separated numbers that
+## appear between braces, e.g. "{1, 5, 12}".
+## Returns list(NA, positions): the name element is NA, and positions is the
+## numeric vector parsed from the line.
+cdk.lf <- function(line) {
+  ## TRUE rather than T, which is an ordinary variable and can be reassigned
+  p <- regexpr("{([0-9,\\s]*)}",line,perl=TRUE)
+  ## drop the enclosing braces, then the commas
+  s <- gsub(',','',substr(line, p+1, p+attr(p,"match.length")-2))
+  s <- lapply( strsplit(s,' '), as.numeric )
+  list(NA, s[[1]])
+}
+
+## Line handler for MOE output: extracts the space separated numbers that
+## appear between double quotes, e.g. "3 7 9".
+## Returns list(NA, positions): the name element is NA, and positions is the
+## numeric vector parsed from the line.
+moe.lf <- function(line) {
+  ## TRUE rather than T, which is an ordinary variable and can be reassigned
+  p <- regexpr("\"([0-9\\s]*)\"",line, perl=TRUE)
+  ## drop the enclosing quotes
+  s <- substr(line, p+1, p+attr(p,"match.length")-2)
+  s <- lapply( strsplit(s,' '), as.numeric )
+  list(NA, s[[1]])
+}
+
+## Line handler for BCI output. The line is split on whitespace: the first
+## field is the name, the last two fields are discarded and the remaining
+## fields are converted to numbers.
+## Returns list(name, positions).
+bci.lf <- function(line) {
+  fields <- strsplit(line, '\\s')[[1]]
+  nfield <- length(fields)
+  ## indices of the name and of the two trailing fields
+  dropped <- c(1, nfield, nfield-1)
+  list(fields[1], as.numeric(fields[-dropped]))
+}
+
+## Read fingerprints from a text file, one fingerprint per line.
+##
+## f:      path of the file to read.
+## size:   bit length given to every fingerprint that is created.
+## lf:     line handler; called with one line and expected to return a list
+##         whose first element is the name (or NA) and whose second element
+##         holds the positions of the bits that are on.
+## header: if TRUE the first line of the file is skipped.
+## Returns a list of 'fingerprint' objects, one per line that was read.
+fp.read <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE) {
+  ## The provider is the text of the 'lf' argument. substitute() also covers
+  ## the case where 'lf' is left at its default, which match.call() omits.
+  provider <- paste(deparse(substitute(lf)), collapse="")
+
+  fcon <- file(description=f,open='r')
+  ## close the connection even if a line handler fails
+  on.exit(close(fcon), add=TRUE)
+  lines <- readLines(fcon,n=-1)
+  if (header) lines <- lines[-1]
+
+  lapply(lines, function(line) {
+    dat <- lf(line)
+    if (is.na(dat[[1]])) name <- ""
+    else name <- dat[[1]]
+
+    new("fingerprint",
+        nbit=size,
+        bits=as.numeric(dat[[2]]),
+        folded=FALSE,
+        provider=provider,
+        name=name)
+  })
+}
+
+# Reads fingerprints with fp.read and converts the resulting list with
+# fp.to.matrix. The length of the bit string has to be supplied since
+# fp.read does not provide that information.
+fp.read.to.matrix <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE) {
+    fp.to.matrix(fp.read(f, size, lf, header))
+  }

Added: pkg/R/zzz.R
===================================================================
--- pkg/R/zzz.R	                        (rev 0)
+++ pkg/R/zzz.R	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1 @@
+# Load hook: makes sure the 'methods' package is attached
+.onLoad <- function(lib, pkg) {
+  require(methods)
+}

Added: pkg/README
===================================================================
--- pkg/README	                        (rev 0)
+++ pkg/README	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,10 @@
+1. Put any C/C++/Fortran code in 'src'
+2. If you have compiled code, add a .First.lib() function in 'R'
+   to load the shared library
+3. Edit the help file skeletons in 'man'
+4. Run R CMD build to create the index files
+5. Run R CMD check to check the package
+6. Run R CMD build to make the package file
+
+
+Read "Writing R Extensions" for more information.

Added: pkg/man/bitspec.Rd
===================================================================
--- pkg/man/bitspec.Rd	                        (rev 0)
+++ pkg/man/bitspec.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,46 @@
+\name{bit.spectrum}
+\alias{bit.spectrum}
+\title{
+	Generate a Bit Spectrum from a List of Fingerprints
+}
+\description{
+The idea of comparing datasets using fingerprints was described in
+Guha \& Schurer (2008). The idea is that one can summarize the dataset
+by counting the frequency of occurrence of each bit position. The
+frequency is normalized by the number of fingerprints considered. Thus a
+collection of N fingerprints can be converted to a single vector of
+numbers highlighting the most frequent bits with respect to a given
+dataset. A plot of this vector looks like a traditional spectrum and
+hence the name.
+
+The bit spectra for two datasets (assuming that the same types of
+fingerprints have been used) allows one to compare the similarity of
+the datasets, without having to do a full pairwise similarity
+calculation. The difference between the structural features of the
+datasets can be quantified by evaluating the distance between the two
+bit spectra.
+}
+\usage{
+bit.spectrum(fplist)
+}
+\arguments{
+  \item{fplist}{
+        A list structure with each element being an object of class
+	\code{fingerprint}. These can be constructed by hand or
+	read from disk via \code{\link{fp.read}}.
+
+	All fingerprints in the list should be of the same length.
+    }
+}
+\value{
+A numeric vector of length equal to the size of the fingerprints.
+}
+\seealso{
+    \code{\link{distance}}, \code{\link{fp.read}}
+}
+\references{
+Guha, R.; Schurer, S.; \emph{J. Comp. Aid. Molec. Des.}, \bold{2008},
+    \emph{22}, 367-384.
+}
+\keyword{programming}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/distance.Rd
===================================================================
--- pkg/man/distance.Rd	                        (rev 0)
+++ pkg/man/distance.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,129 @@
+\name{distance}
+\alias{distance}
+\title{
+Calculates the Similarity or Dissimilarity Between Two Fingerprints
+}
+\description{
+  A number of distance metrics can be calculated for binary
+  fingerprints. Some of these are actually similarity metrics and
+  thus represent the reverse of a distance metric.
+
+  The following are distance (dissimilarity) metrics
+  \itemize{
+    \item Hamming
+    \item Mean Hamming
+    \item Soergel
+    \item Pattern Difference
+    \item Variance
+    \item Size
+    \item Shape
+  }
+
+  The following metrics are similarity metrics and so the distance can
+  be obtained by subtracting the value from 1.0
+  \itemize{
+    \item Tanimoto
+    \item Dice
+    \item Modified Tanimoto
+    \item Simple
+    \item Jaccard
+    \item Russel-Rao
+    \item Rodgers Tanimoto
+    \item Cosine
+    \item Achiai
+    \item Carbo
+    \item Baroniurbanibuser
+    \item Kulczynski2
+  } 
+
+  Finally the method also provides a set of composite and asymmetric
+  distance metrics
+  \itemize{
+    \item Hamann
+    \item Yule
+    \item Pearson
+    \item Dispersion
+    \item McConnaughey
+    \item Stiles
+    \item Simpson
+    \item Petke
+  }
+  The default metric is the Tanimoto coefficient.
+}
+\usage{
+distance(fp1, fp2, method)
+}
+\arguments{
+  \item{fp1}{
+    An object of class \code{fingerprint}
+  }
+  \item{fp2}{
+    An object of class \code{fingerprint}
+  }
+  \item{method}{
+    The type of distance metric desired. Partial matching is
+    supported and the default is \code{tanimoto}. Alternative values are
+    \itemize{
+      \item \code{euclidean} 
+      \item \code{hamming}
+      \item \code{meanHamming}
+      \item \code{soergel}
+      \item \code{patternDifference}
+      \item \code{variance}
+      \item \code{size}
+      \item \code{shape}
+
+      \item \code{jaccard}
+      \item \code{dice}
+      \item \code{mt}
+      \item \code{simple}
+      \item \code{russelrao}
+      \item \code{rodgerstanimoto}
+      \item \code{cosine}
+      \item \code{achiai}
+      \item \code{carbo}
+      \item \code{baroniurbanibuser}
+      \item \code{kulczynski2}
+
+      \item \code{hamann}
+      \item \code{yule}
+      \item \code{pearson}
+      \item \code{dispersion}
+      \item \code{mcconnaughey}
+      \item \code{stiles}
+
+      \item \code{simpson}
+      \item \code{petke}
+
+    }
+
+  }
+}
+\value{
+  Numeric value representing the distance in the specified metric between the
+  supplied fingerprint objects
+}
+\examples{
+# make 2 fingerprint vectors
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+
+# calculate the tanimoto coefficient
+distance(fp1,fp2) # should be 1
+
+# Invert the second fingerprint
+fp3 <- !fp2
+
+distance(fp1,fp3) # should be 0
+}
+
+\references{Fligner, M.A.; Verducci, J.S.; Blower, P.E.;
+  A Modification of the Jaccard-Tanimoto Similarity Index for
+  Diverse Selection of Chemical Compounds Using Binary Strings,
+  \emph{Technometrics}, 2002, \emph{44}(2), 110-119
+
+  Monve, V.; Introduction to Similarity Searching in
+  Chemistry, \emph{MATCH - Comm. Math. Comp. Chem.}, 2004, \emph{51}, 7-38
+}
+
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/facmat.Rd
===================================================================
--- pkg/man/facmat.Rd	                        (rev 0)
+++ pkg/man/facmat.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,35 @@
+\name{fp.factor.matrix}
+\alias{fp.factor.matrix}
+\title{
+  Converts a List of Fingerprints to a data.frame of Factors
+}
+\description{
+This function will convert  a \code{list} of fingerprint objects
+to a \code{data.frame} of factors with levels 1 and 0.
+}
+\usage{
+fp.factor.matrix(fplist)
+}
+\arguments{
+  \item{fplist}{
+        A list structure with each element being an object of class
+	\code{fingerprint}. These can be constructed by hand or
+	read from disk via \code{\link{fp.read}}    
+    }
+}
+\value{
+A \code{data.frame} of factors with \code{length(fplist)} rows and one column per bit position
+}
+\seealso{
+    \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.factor.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/fingerprint.Rd
===================================================================
--- pkg/man/fingerprint.Rd	                        (rev 0)
+++ pkg/man/fingerprint.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,77 @@
+\name{fingerprint-class}
+\docType{class}
+\alias{fingerprint-class}
+\alias{distance,fingerprint,fingerprint,missing-method}
+\alias{distance,fingerprint,fingerprint,character-method}
+\alias{euc.vector,fingerprint-method}
+\alias{fold,fingerprint-method}
+\alias{random.fingerprint,numeric,numeric-method}
+
+\title{Class "fingerprint"}
+\description{This class represents binary fingerprints, usually
+  generated by a variety of cheminformatics software, but not
+  restricted to such
+  }
+\section{Objects from the Class}{
+Objects can be created by calls of the form \code{new("fingerprint", ...)}.
+	Fingerprints can traditionally be thought of as a vector of 1's and
+	0's. However for large fingerprints this is inefficient and
+	instead we simply store the positions of the bits that are
+	on. Certain operations also need to know the length of the
+	original bit string and this length is stored in the object at
+	construction. Even though we store extra information along with
+	the bit positions, conceptually we still consider the objects as
+	simple bit strings. Thus the usual bitwise logical operations
+	(&, |, !, xor) can be applied to objects of this class.
+}
+\section{Slots}{
+	 \describe{
+    \item{\code{bits}:}{Object of class \code{"numeric"} ~~ A vector
+      indicating the bit positions that are on. }
+    \item{\code{nbit}:}{Object of class \code{"numeric"} ~~ Indicates the length of the original bit string.}
+    \item{\code{folded}:}{Object of class \code{"logical"} ~~ Indicates
+      whether the fingerprint has been folded.}
+    \item{\code{provider}:}{Object of class \code{"character"} ~~
+      Indicates the source of the fingerprint. Can be useful to keep
+      track of what software generated the fingerprint.}
+    \item{\code{name}:}{Object of class \code{"character"} ~~
+      The name associated with the fingerprint. If no name is available
+    this gets set to an empty string}
+  }
+}
+\section{Methods}{
+  \describe{
+    \item{distance}{\code{signature(fp1 = "fingerprint", fp2 = "fingerprint", method = "missing")}: ... }
+    \item{distance}{\code{signature(fp1 = "fingerprint", fp2 = "fingerprint", method = "character")}: ... }
+    \item{euc.vector}{\code{signature(fp = "fingerprint")}: ... }
+    \item{fold}{\code{signature(fp = "fingerprint")}: ... }
+    \item{random.fingerprint}{\code{signature(nbit = "numeric", on = "numeric")}: ... }    
+	 }
+}
+\references{}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\seealso{
+  \code{\link{fp.read}}, \code{\link{fp.read.to.matrix}}
+  \code{\link{fp.sim.matrix}}, \code{\link{fp.to.matrix}},
+  \code{\link{fp.factor.matrix}}
+  \code{\link{random.fingerprint}}
+  }
+
+\examples{
+## make fingerprints
+x <- new("fingerprint", nbit=128, bits=sample(1:128, 100))
+y <- x
+distance(x,y) # should be 1
+x <- new("fingerprint", nbit=128, bits=sample(1:128, 100))
+distance(x,y)
+folded <- fold(x)
+
+## binary operations on fingerprints
+x <- new("fingerprint", nbit=8, bits=c(1,2,3,6,8))
+y <- new("fingerprint", nbit=8, bits=c(1,2,4,5,7,8))
+x & y
+x | y
+!x
+}
+\keyword{classes}
+\keyword{logic}
\ No newline at end of file

Added: pkg/man/fold.Rd
===================================================================
--- pkg/man/fold.Rd	                        (rev 0)
+++ pkg/man/fold.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,33 @@
+
+\name{fold}
+\alias{fold}
+\title{
+  Fold a fingerprint
+}
+\description{
+    In many situations a fingerprint is generated using a large length (such as 1024 bits or more).
+    As a result of this, the fingerprints for a dataset can be very sparse. One approach to increasing 
+    bit density of such fingerprints is to fold them. This is performed by dividing the original
+    fingerprint bitstring into two substrings of equal length and then perform an OR on
+    the two substrings. 
+
+    It should be noted that many fingerprint generating routines will perform this internally.
+}
+\usage{
+fold(fp)
+}
+\arguments{
+    \item{fp}{
+        The fingerprint to fold. Should be of class \code{fingerprint}.
+    }
+}
+\value{
+An object of class \code{fingerprint} representing the folded fingerprint.
+}
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=64, bits=sample(1:64, 30))
+fold(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/fplogical.Rd
===================================================================
--- pkg/man/fplogical.Rd	                        (rev 0)
+++ pkg/man/fplogical.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,39 @@
+\name{!,&,|}
+\alias{!}
+\alias{|}
+\alias{&}
+\alias{xor}
+\alias{|,fingerprint,fingerprint-method}
+\alias{&,fingerprint,fingerprint-method}
+\alias{xor,fingerprint,fingerprint-method}
+\alias{!,fingerprint-method}
+\title{
+    Logical Operators for Fingerprints
+}
+\description{
+These functions perform logical operations (AND, OR, NOT, XOR) on the supplied
+binary fingerprints. Thus for two fingerprints A and B we have
+\describe{
+\item{\code{&}}{Logical AND}
+\item{\code{|}}{Logical OR}
+\item{\code{xor}}{Logical XOR}
+\item{\code{!}}{Logical NOT (negation)}
+}
+}
+\arguments{
+    \item{e1}{
+      An object of class \code{fingerprint}
+    }
+    \item{e2}{
+      An object of class \code{fingerprint}
+    }
+  }
+\value{
+A fingerprint object
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+
+
+

Added: pkg/man/length.Rd
===================================================================
--- pkg/man/length.Rd	                        (rev 0)
+++ pkg/man/length.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,27 @@
+\name{length}
+\alias{length}
+\alias{length,fingerprint-method}
+\title{
+  Fingerprint Bit Length
+}
+\description{
+  Returns the length of the fingerprint. That is, this is the length of
+  the entire bit string and not simply the number of bits that are on.
+}
+\usage{
+  \S4method{length}{fingerprint}(x)
+}
+\arguments{
+    \item{x}{
+      An object of class \code{fingerprint}
+    }
+}
+\value{
+The length of the bit string
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+
+
+

Added: pkg/man/linefunc.Rd
===================================================================
--- pkg/man/linefunc.Rd	                        (rev 0)
+++ pkg/man/linefunc.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,35 @@
+\name{cdk.lf, moe.lf, bci.lf}
+\alias{cdk.lf}
+\alias{moe.lf}
+\alias{bci.lf}
+\title{
+    Functions to parse lines from fingerprint files
+}
+\description{
+These functions take a single line and parse it to produce
+a vector of integers which represents the positions of the 'on' bits in
+a fingerprint. This allows the user to use \code{fp.read} with arbitrary fingerprint
+files. A new file format can be handled by defining a new line parser function.
+Currently the three functions process fingerprint files obtained from the
+CDK (\url{http://cdk.sourceforge.net}), MOE (\url{http://chemcomp.com})
+and BCI (\url{http://www.digitalchemistry.co.uk/}).
+
+}
+\usage{
+    cdk.lf(line)
+    moe.lf(line)
+    bci.lf(line)
+}
+\arguments{
+    \item{line}{
+        The line to parse
+    }
+}
+\value{
+A vector of integers representing 'on' bits 
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+

Added: pkg/man/mat.Rd
===================================================================
--- pkg/man/mat.Rd	                        (rev 0)
+++ pkg/man/mat.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,40 @@
+\name{fp.to.matrix}
+\alias{fp.to.matrix}
+\title{
+  Converts a List of Fingerprints to a Matrix
+}
+\description{
+    In general, fingerprint data is read from a file or obtained via
+    calls to an external generator and the return value is a list of fingerprints.
+    This function takes the list and returns a matrix having number of rows equal to 
+    the number of fingerprints and the number of columns equal to the length of
+    the fingerprint. Each element is 1 or 0 (1's being specified by the positions
+    in each fingerprint vector)
+}
+\usage{
+fp.to.matrix(fplist)
+}
+\arguments{
+    \item{fplist}{
+        A list structure with each element being an object of class
+	\code{fingerprint}. These can be constructed by hand or
+	read from disk via \code{\link{fp.read}}.
+    }
+}
+\value{
+A matrix with dimensions equal to \code{(length(fplist), bit length)}
+where bit length is a property of the fingerprint objects in the list.
+}
+\seealso{
+    \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.to.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/read.Rd
===================================================================
--- pkg/man/read.Rd	                        (rev 0)
+++ pkg/man/read.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,54 @@
+\name{fp.read, fp.read.to.matrix}
+\alias{fp.read}
+\alias{fp.read.to.matrix}
+\title{
+    Functions to Read Fingerprints From Files
+}
+\description{
+\code{fp.read} reads in a set of fingerprints from a file. Fingerprint 
+output from the CDK, MOE and BCI can be handled.
+
+Each fingerprint is represented as a \code{fingerprint} object.
+\code{fp.read} returns a \code{list} structure, each element being a
+\code{fingerprint} object.
+
+\code{fp.read.to.matrix} is a utility function that reads the fingerprints directly to
+matrix form (columns are the bit positions and the rows are the objects whose fingerprints
+have been evaluated)
+}
+
+\usage{
+fp.read(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE)
+fp.read.to.matrix(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE)
+}
+\arguments{
+  \item{f}{
+    File containing the fingerprints
+  }
+  \item{size}{
+    The bit length of the fingerprints being considered
+  }
+  \item{lf}{
+    A line reading function that parses a single line from
+    a fingerprint file. Currently, three such functions are provided
+    that parse the fingerprints from the output of the CDK, MOE and the
+    BCI toolkit,  respectively.
+  }
+  \item{header}{
+    Indicates whether the first line of the fingerprint file is
+    a header line
+  }
+}
+\seealso{
+  \code{\link{cdk.lf}},
+  \code{\link{moe.lf}},
+  \code{\link{bci.lf}}
+}
+\value{
+  A \code{list} or \code{matrix} of fingerprints
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+

Added: pkg/man/rndfp.Rd
===================================================================
--- pkg/man/rndfp.Rd	                        (rev 0)
+++ pkg/man/rndfp.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+\name{random.fingerprint}
+\alias{random.fingerprint}
+\title{
+  Generate Randomized Fingerprints
+}
+\description{
+  A utility function that can be used to generate binary fingerprints
+  of a specified length with a specified number of bit positions
+  (selected randomly) set to 1. Currently bit positions are selected uniformly.
+}
+\usage{
+random.fingerprint(nbit,on)
+}
+\arguments{
+    \item{nbit}{
+      The length of the fingerprint, that is, the total number of bits.
+      Must be a positive integer.
+    }
+    \item{on}{
+      How many positions should be set to 1
+      }
+}
+\value{
+An object of class \code{fingerprint}
+}
+\examples{
+# make a fingerprint vector
+fp <- random.fingerprint(32, 16)
+as.character(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/show.Rd
===================================================================
--- pkg/man/show.Rd	                        (rev 0)
+++ pkg/man/show.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,22 @@
+\name{show}
+\alias{show,fingerprint-method}
+\title{
+    String Representation of a Fingerprint
+}
+\description{
+Simply summarize the fingerprint.
+}
+
+\usage{
+\S4method{show}{fingerprint}(object)
+}
+\arguments{
+    \item{object}{
+      An object of class \code{fingerprint}
+    }
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+

Added: pkg/man/sim.Rd
===================================================================
--- pkg/man/sim.Rd	                        (rev 0)
+++ pkg/man/sim.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,46 @@
+\name{fp.sim.matrix}
+\alias{fp.sim.matrix}
+\title{
+  Calculates a Similarity Matrix for a Set of Fingerprints
+}
+\description{
+Given a set of fingerprints, a pairwise similarity can be calculated using the
+various distance metrics defined for binary strings. This function calculates
+the pairwise similarity matrix for a set of \code{fingerprint} objects supplied in a \code{list}
+structure. Any of the distance metrics provided by \code{\link{distance}} can be used and the 
+default is the Tanimoto metric.
+
+Note that if the Euclidean distance is specified then the resultant matrix is a
+distance matrix and not a similarity matrix.
+}
+\usage{
+fp.sim.matrix(fplist, method='tanimoto')
+}
+\arguments{
+  \item{fplist}{
+        A list structure with each element being an object of class
+	\code{fingerprint}. These can be constructed by hand or
+	read from disk via \code{\link{fp.read}}.
+    }
+    \item{method}{
+    The type of distance metric to use. Alternatives are \code{euclidean},
+    \code{dice} and \code{mt}. The default is \code{tanimoto}. Partial
+    matching is supported.
+    }
+}
+\value{
+A matrix with dimensions equal to \code{(length(fplist), length(fplist))}
+}
+\seealso{
+    \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.sim.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/string.Rd
===================================================================
--- pkg/man/string.Rd	                        (rev 0)
+++ pkg/man/string.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+\name{as.character}
+\alias{as.character}
+\alias{as.character,fingerprint-method}
+\title{
+  Generates a String Representation of a Fingerprint
+}
+\description{
+    The function returns a string of 1's and 0's corresponding to the 
+    fingerprint object supplied        
+}
+\usage{
+\S4method{as.character}{fingerprint}(x)
+}
+\arguments{
+    \item{x}{
+      An object of class \code{fingerprint}
+    }
+}
+\value{
+A string of 1's and 0's
+}
+
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=32, bits=sample(1:32, 20))
+
+# print out the string representation
+as.character(fp)
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/man/vec.Rd
===================================================================
--- pkg/man/vec.Rd	                        (rev 0)
+++ pkg/man/vec.Rd	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,33 @@
+\name{euc.vector}
+\alias{euc.vector}
+\title{
+  Euclidean Representation of Binary Fingerprints
+}
+\description{
+ Ordinarily, a binary fingerprint can be considered to represent a
+ corner of an nD hypercube. However, in many cases using such a representation
+ can lead to a very sparse space. Consequently, one approach is to convert
+ the fingerprint so that it represents a point on an nD unit hypersphere.
+
+ The resultant fingerprint is then an nD coordinate.
+}
+\usage{
+euc.vector(fp)
+}
+\arguments{
+    \item{fp}{
+        An object of class \code{fingerprint}.
+    }
+}
+\value{
+A numeric vector of length equal to the bit length of the fingerprint. The
+result corresponds to a unit vector for a point
+on the nD hypersphere.
+}
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=8, bits=c(1,3,4,5,7))
+vec <- euc.vector(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}

Added: pkg/src/fpdistance.c
===================================================================
--- pkg/src/fpdistance.c	                        (rev 0)
+++ pkg/src/fpdistance.c	2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,59 @@
+#include <R.h>
+#include <R_ext/Utils.h>
+
+#define X(_m,_i,_j,_nrow) _m[ _i + _nrow * _j ]
+
+#define METRIC_TANIMOTO       1
+#define METRIC_EUCLIDEAN      2
+
+double d_tanimoto(double*,double*,int);
+double d_euclidean(double*,double*,int);
+
+/**
+Dispatches to the requested metric and stores the result in *ret.
+
+fp1, fp2 : arrays of 1's and 0's, each of length equal to the size of
+           the fingerprint
+nbit     : pointer to the size of the fingerprint
+metric   : pointer to the metric code (METRIC_TANIMOTO or METRIC_EUCLIDEAN)
+ret      : receives the computed value; set to 0.0 when the metric code
+           is not one of the two above
+
+NOTE(review): every argument is a pointer, presumably so the routine can be
+invoked through R's .C interface -- confirm against the R caller.
+**/
+void fpdistance(double *fp1, double *fp2, int *nbit, int *metric, double *ret) {
+  if (*metric == METRIC_TANIMOTO) {
+    *ret = d_tanimoto(fp1, fp2, *nbit);
+  } else if (*metric == METRIC_EUCLIDEAN) {
+    *ret = d_euclidean(fp1, fp2, *nbit);
+  } else {
+    /* unrecognised metric code: report 0.0 */
+    *ret = 0.0;
+  }
+}
+
+/**
+Tanimoto coefficient between two fingerprints, nc / (na + nb + nc), where
+  nc = number of positions set to 1 in both fp1 and fp2
+  na = number of positions set to 1 in fp1 and 0 in fp2
+  nb = number of positions set to 1 in fp2 and 0 in fp1
+
+fp1, fp2 : arrays of 1's and 0's, each of length nbit
+nbit     : size of the fingerprint
+
+Returns -1.0 if nbit <= 0. If no position contributes to any of the three
+counts (e.g. both fingerprints are all zeros) the ratio would be 0/0, so
+1.0 is returned instead of NaN.
+
+http://www.daylight.com/dayhtml/doc/theory/theory.finger.html
+**/
+double d_tanimoto(double *fp1, double *fp2, int nbit) {
+  int i;
+  int nc = 0;
+  int na = 0;
+  int nb = 0;
+  int total;
+  if (nbit <= 0) return(-1.0);
+  for (i = 0; i < nbit; i++) {
+    if (fp1[i] == 1 && fp2[i] == 1) nc++;
+    if (fp1[i] == 1 && fp2[i] == 0) na++;
+    if (fp2[i] == 1 && fp1[i] == 0) nb++;
+  }
+  total = na + nb + nc;
+  /* guard against 0/0 when neither fingerprint has a bit set */
+  if (total == 0) return(1.0);
+  return ((double) nc) / (double) total;
+}
+
+/**
+Computes sqrt(m / nbit), where m is the number of positions at which fp1
+and fp2 agree (both equal to 1, or both equal to 0).
+
+fp1, fp2 : arrays of 1's and 0's, each of length nbit
+nbit     : size of the fingerprint
+
+Returns -1.0 if nbit <= 0.
+
+http://www.daylight.com/dayhtml/doc/theory/theory.finger.html
+**/
+double d_euclidean(double *fp1, double *fp2, int nbit) {
+  int pos;
+  int matches = 0;
+
+  if (nbit <= 0) {
+    return -1.0;
+  }
+
+  for (pos = 0; pos < nbit; pos++) {
+    /* only exact 1/1 and 0/0 pairs count as agreement */
+    int both_on  = (fp1[pos] == 1 && fp2[pos] == 1);
+    int both_off = (fp1[pos] == 0 && fp2[pos] == 0);
+    if (both_on || both_off) {
+      matches++;
+    }
+  }
+
+  return sqrt((double) matches / (double) nbit);
+}



More information about the Fingerprint-commits mailing list