[Fingerprint-commits] r2 - in pkg: . R man src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Oct 24 23:02:21 CEST 2008
Author: rajarshi
Date: 2008-10-24 23:02:21 +0200 (Fri, 24 Oct 2008)
New Revision: 2
Added:
pkg/DESCRIPTION
pkg/INDEX
pkg/NAMESPACE
pkg/R/bitspec.R
pkg/R/fingerprint.R
pkg/R/matrix.R
pkg/R/misc.R
pkg/R/ops.R
pkg/R/read.R
pkg/R/zzz.R
pkg/README
pkg/man/bitspec.Rd
pkg/man/distance.Rd
pkg/man/facmat.Rd
pkg/man/fingerprint.Rd
pkg/man/fold.Rd
pkg/man/fplogical.Rd
pkg/man/length.Rd
pkg/man/linefunc.Rd
pkg/man/mat.Rd
pkg/man/read.Rd
pkg/man/rndfp.Rd
pkg/man/show.Rd
pkg/man/sim.Rd
pkg/man/string.Rd
pkg/man/vec.Rd
pkg/src/
pkg/src/fpdistance.c
Log:
Added all the sources from my local repo. Unfortunately, lost the history
Added: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION (rev 0)
+++ pkg/DESCRIPTION 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,19 @@
+Package: fingerprint
+Version: 3.1
+Date: 2008-10-17
+Title: Functions to operate on binary fingerprint data
+Author: Rajarshi Guha <rguha at indiana.edu>
+Maintainer: Rajarshi Guha <rguha at indiana.edu>
+Description: This package contains functions to manipulate binary fingerprints
+ of arbitrary length. A fingerprint is represented by an object of S4 class 'fingerprint'
+ which is internally represented as a vector of integers, such
+ that each element represents the position in the fingerprint that is set to 1.
+ The bitwise logical functions in R are overridden so that they can be used directly
+ with 'fingerprint' objects. A number of distance metrics are also
+ available (many contributed by Michael Fadock). Fingerprints
+ can be converted to Euclidean vectors (i.e., points on the unit hypersphere) and
+ can also be folded using OR. Arbitrary fingerprint formats can be handled via line
+ handlers. Currently handlers are provided for CDK, MOE and BCI fingerprint data.
+License: GPL
+Depends: methods
+LazyLoad: yes
Added: pkg/INDEX
===================================================================
--- pkg/INDEX (rev 0)
+++ pkg/INDEX 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,19 @@
+! Logical Operators for Fingerprints
+as.character Generates a String Representation of a
+ Fingerprint
+cdk.lf Functions to parse lines from fingerprint files
+distance Calculates the Similarity or Dissimilarity
+ Between Two Fingerprints
+euc.vector Euclidean Representation of Binary Fingerprints
+fingerprint-class Class "fingerprint"
+fold Fold a fingerprint
+fp.factor.matrix Converts a List of Fingerprints to a data.frame
+ of Factors
+fp.read Functions to Read Fingerprints From Files
+fp.sim.matrix Calculates a Similarity Matrix for a Set of
+ Fingerprints
+fp.to.matrix Converts a List of Fingerprints to a Matrix
+length Fingerprint Bit Length
+random.fingerprint Generate Randomized Fingerprints
+show,fingerprint-method
+ String Representation of a Fingerprint
Added: pkg/NAMESPACE
===================================================================
--- pkg/NAMESPACE (rev 0)
+++ pkg/NAMESPACE 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,5 @@
+# S4 machinery (setClass, setMethod, new, ...) comes from 'methods'; import the
+# whole namespace, since importFrom() without symbol names imports nothing.
+import("methods")
+exportClasses("fingerprint")
+exportMethods("fold", "euc.vector", "distance", "random.fingerprint", "as.character", "length", "show")
+export("fp.sim.matrix", "fp.to.matrix", "fp.factor.matrix", "fp.read.to.matrix", "fp.read", "moe.lf", "bci.lf", "cdk.lf", "bit.spectrum")
+useDynLib(fingerprint)
Added: pkg/R/bitspec.R
===================================================================
--- pkg/R/bitspec.R (rev 0)
+++ pkg/R/bitspec.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,12 @@
+## Compute the normalized bit spectrum of a list of fingerprints.
+##
+## Arguments:
+##   fplist - a non-empty list of 'fingerprint' objects; the bit length of
+##            the first element fixes the length of the result
+## Value:
+##   numeric vector whose i-th element is the fraction of fingerprints in
+##   'fplist' that have bit i set
+bit.spectrum <- function(fplist) {
+  ## inherits()/is() stay scalar even for objects carrying several classes,
+  ## and the explicit length test replaces an opaque subscript error
+  if (!inherits(fplist, "list") || length(fplist) == 0)
+    stop("Must provide a list of fingerprint objects")
+  if (!all(vapply(fplist, is, logical(1), "fingerprint")))
+    stop("Must provide a list of fingerprint objects")
+  nbit <- fplist[[1]]@nbit
+  spec <- numeric(nbit)
+  for (fp in fplist) {
+    ## indexed assignment counts each fingerprint at most once per position
+    bits <- fp@bits
+    spec[bits] <- spec[bits] + 1
+  }
+  spec / length(fplist)
+}
Added: pkg/R/fingerprint.R
===================================================================
--- pkg/R/fingerprint.R (rev 0)
+++ pkg/R/fingerprint.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,69 @@
+## S4 class holding a binary fingerprint in sparse form.
+##
+## Slots:
+##   bits     - numeric vector with the positions of the bits that are on
+##   nbit     - total length of the bit string
+##   folded   - logical flag, FALSE by default
+##   provider - character label describing where the fingerprint came from
+##   name     - character name attached to the fingerprint
+## Validity: no element of 'bits' may be larger than 'nbit'.
+setClass("fingerprint",
+         representation(bits = "numeric",
+                        nbit = "numeric",
+                        folded = "logical",
+                        provider = "character",
+                        name = "character"),
+         prototype = prototype(bits = numeric(0),
+                               nbit = 0,
+                               folded = FALSE,
+                               provider = "",
+                               name = ""),
+         validity = function(object) {
+           if (any(object@bits > object@nbit))
+             return("Bit positions were greater than the specified bit length")
+           TRUE
+         })
+
+#setGeneric("show", function(object) standardGeneric("show"))
+## Print a short, human readable summary of a fingerprint: its name, bit
+## length, folded flag, provider and the sorted positions of the on bits.
+setMethod("show", "fingerprint",
+          function(object) {
+            cat("Fingerprint object\n")
+            cat(" name = ", object@name, "\n")
+            cat(" length = ", object@nbit, "\n")
+            cat(" folded = ", object@folded, "\n")
+            cat(" source = ", object@provider, "\n")
+            cat(" bits on = ", paste(sort(object@bits), collapse=' '), "\n")
+          })
+
+
+## Render a fingerprint as a string of '0' and '1' characters, one per bit
+## position, with '1' at every position listed in the 'bits' slot.
+setMethod('as.character', "fingerprint",
+          function(x) {
+            dense <- numeric(x@nbit)
+            dense[x@bits] <- 1
+            paste(dense, collapse = '')
+          })
+
+## Length of a fingerprint is the length of the whole bit string (slot
+## 'nbit'), not the number of bits that are on.
+setMethod("length", "fingerprint",
+          function(x) x@nbit)
+
+## Split the textual form of a call into its argument names and values.
+##
+## Arguments:
+##   obj - a 'call' object, e.g. the result of match.call()
+## Value:
+##   list of character strings, one per comma separated argument; each
+##   element holds the text to the right of the first '=' (NA when there is
+##   none) and is named by the text to its left, with blanks removed
+## The parsing is purely textual: commas or '=' inside an argument value are
+## not recognised as belonging to that value.
+parseCall <- function (obj)
+{
+  ## inherits() gives a scalar answer even for multi-class objects
+  if (!inherits(obj, "call")) {
+    stop("Must supply a 'call' object")
+  }
+  ## deparse() may wrap long calls over several strings
+  srep <- paste(deparse(obj), collapse = "")
+  fname <- unlist(strsplit(srep, "\\("))[1]
+  func <- unlist(strsplit(srep, paste0(fname, "\\(")))[2]
+  ## drop the closing parenthesis of the call
+  chars <- unlist(strsplit(func, ""))
+  func <- paste(chars[-length(chars)], collapse = "")
+  pieces <- unlist(strsplit(func, ","))
+  vals <- list()
+  nms <- c()
+  for (i in seq_along(pieces)) {
+    keyval <- unlist(strsplit(pieces[i], "="))
+    nms[i] <- gsub(" ", "", keyval[1])
+    vals[[i]] <- gsub(" ", "", keyval[2])
+  }
+  names(vals) <- nms
+  vals
+}
Added: pkg/R/matrix.R
===================================================================
--- pkg/R/matrix.R (rev 0)
+++ pkg/R/matrix.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+## Pairwise similarity matrix for a list of fingerprints.
+##
+## Arguments:
+##   fplist - list of 'fingerprint' objects
+##   method - metric name handed on to distance()
+## Value:
+##   symmetric length(fplist) x length(fplist) numeric matrix whose
+##   off-diagonal entries come from distance() and whose diagonal is 1
+fp.sim.matrix <- function(fplist, method='tanimoto') {
+  n <- length(fplist)
+  sim <- matrix(0, nrow = n, ncol = n)
+  ## with fewer than two fingerprints there are no pairs, and 1:(n-1)
+  ## would count downwards, so guard the loop bounds
+  for (i in seq_len(max(n - 1, 0))) {
+    rest <- (i + 1):n
+    v <- unlist(lapply(fplist[rest], distance, fp2 = fplist[[i]], method = method))
+    sim[i, rest] <- v
+    sim[rest, i] <- v
+  }
+  diag(sim) <- 1.0
+  sim
+}
+
+## Takes the fingerprints, P bits, for a set of N molecules supplied as
+## a list structure and creates an N x P matrix.
+##
+## Arguments:
+##   fplist - list of 'fingerprint' objects; the bit length P is read from
+##            the first element
+## Value:
+##   numeric N x P matrix with 1 at every on bit and 0 elsewhere
+fp.to.matrix <- function( fplist ) {
+  size <- fplist[[1]]@nbit
+  m <- matrix(0, nrow = length(fplist), ncol = size)
+  for (row in seq_along(fplist)) {
+    m[row, fplist[[row]]@bits] <- 1
+  }
+  m
+}
+
+## Convert a list of fingerprints to a data.frame of factors.
+##
+## Arguments:
+##   fplist - list of 'fingerprint' objects
+## Value:
+##   data.frame with one row per fingerprint and one column per bit
+##   position; every column is a factor with levels 0 and 1
+fp.factor.matrix <- function( fplist ) {
+  m <- data.frame(fp.to.matrix(fplist))
+  ## m[] keeps the data.frame shape while replacing each column
+  m[] <- lapply(m, factor, levels=0:1)
+  m
+}
Added: pkg/R/misc.R
===================================================================
--- pkg/R/misc.R (rev 0)
+++ pkg/R/misc.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,188 @@
+## fold: halve the length of a fingerprint by OR-ing its first half with
+## its second half.
+##
+## Arguments:
+##   fp - a 'fingerprint' object with an even bit length
+## Value:
+##   'fingerprint' of half the original length with the 'folded' slot TRUE
+## Stops when the bit length is odd.
+setGeneric("fold", function(fp) standardGeneric("fold"))
+setMethod("fold", "fingerprint",
+          function(fp) {
+            size <- fp@nbit
+            if (size %% 2 != 0) {
+              stop('Need to supply a fingerprint of even numbered length')
+            }
+            ## expand to a dense logical vector so that each half can be
+            ## re-indexed starting from position 1
+            dense <- rep(FALSE, size)
+            dense[fp@bits] <- TRUE
+
+            half <- size/2
+            lower <- new("fingerprint",
+                         nbit=half,
+                         bits=which(dense[1:half]),
+                         provider="R")
+            upper <- new("fingerprint",
+                         nbit=half,
+                         bits=which(dense[(half+1):size]),
+                         provider="R")
+
+            foldedfp <- lower | upper
+            foldedfp@folded <- TRUE
+            foldedfp
+          })
+
+## euc.vector: dense numeric representation of a fingerprint.
+##
+## Arguments:
+##   fp - a 'fingerprint' object
+## Value:
+##   numeric vector of length nbit that is 0 at off bits and
+##   1/sqrt(nbit) at on bits
+## NOTE(review): the scaling uses the total bit length, so the vector has
+## unit norm only when every bit is on - confirm this is intended.
+setGeneric("euc.vector", function(fp) standardGeneric("euc.vector"))
+setMethod("euc.vector", "fingerprint",
+          function(fp) {
+            nbit <- fp@nbit
+            coord <- rep(0, nbit)
+            coord[fp@bits] <- 1.0 / sqrt(nbit)
+            coord
+          })
+
+
+## Generic for comparing two fingerprints with a named metric. When
+## 'method' is not supplied the call is forwarded with "tanimoto".
+setGeneric("distance",
+           function(fp1,fp2,method) standardGeneric("distance"))
+setMethod("distance",
+          signature("fingerprint", "fingerprint", "missing"),
+          function(fp1,fp2) distance(fp1, fp2, "tanimoto"))
+## distance() for two fingerprints and a metric given by name.
+##
+## Arguments:
+##   fp1, fp2 - 'fingerprint' objects of the same bit length
+##   method   - metric name, resolved with match.arg() against the choices
+##              in the signature below
+## Value:
+##   a single numeric value. 'tanimoto' and 'euclidean' are evaluated by the
+##   compiled routine "fpdistance" (metric codes 1 and 2, result read from
+##   its fifth argument); every other metric is computed in R from the
+##   counts
+##     a = bits on in fp1 only      b = bits on in fp2 only
+##     c = bits on in both          d = bits off in both
+## Stops when the bit lengths differ.
+setMethod("distance", c("fingerprint", "fingerprint", "character"),
+          function(fp1,fp2, method=c('tanimoto', 'euclidean', 'mt',
+                              'simple', 'jaccard', 'dice',
+                              'russelrao', 'rodgerstanimoto','cosine',
+                              'achiai', 'carbo', 'baroniurbanibuser',
+                              'kulczynski2',
+
+                              'hamming', 'meanHamming', 'soergel',
+                              'patternDifference', 'variance', 'size', 'shape',
+
+                              'hamann', 'yule', 'pearson', 'dispersion',
+                              'mcconnaughey', 'stiles',
+
+                              'simpson', 'petke',
+                              'stanimoto', 'seuclidean'
+                              )) {
+
+            if (fp1@nbit != fp2@nbit)
+              stop("Fingerprints must of the same bit length")
+
+            method <- match.arg(method)
+            n <- fp1@nbit
+
+            ## metrics implemented in C work on dense 0/1 vectors
+            if (method == 'tanimoto' || method == 'euclidean') {
+              f1 <- numeric(n)
+              f2 <- numeric(n)
+              f1[fp1@bits] <- 1
+              f2[fp2@bits] <- 1
+              metric <- if (method == 'tanimoto') 1 else 2
+              ## both fingerprints are passed for either metric
+              ret <- .C("fpdistance", as.double(f1), as.double(f2),
+                        as.integer(n), as.integer(metric),
+                        as.double(0.0),
+                        PACKAGE="fingerprint")
+              return (ret[[5]])
+            }
+
+            ## counts are stored as doubles: products such as
+            ## (a+c)*(b+c)*(a+d)*(b+d) overflow R's integer range for
+            ## fingerprints of ordinary length
+            both <- fp1 | fp2
+            c <- as.numeric(length((fp1 & fp2)@bits))   # in A and B
+            a <- as.numeric(length((both & !fp2)@bits)) # in A, not in B
+            b <- as.numeric(length((both & !fp1)@bits)) # in B, not in A
+            d <- as.numeric(length((!both)@bits))       # in neither
+
+            dist <- switch(method,
+              ## Similarity
+              stanimoto = c / (a+b+c),
+              seuclidean = sqrt((d+c) / (a+b+c+d)),
+              dice = c / (.5*a + .5*b + c),
+              mt = {
+                t1 <- c/(n-d)
+                t0 <- d/(n-c)
+                phat <- ((n-d) + c)/(2*n)
+                (2-phat)*t1/3 + (1+phat)*t0/3
+              },
+              simple = (c+d)/n,
+              jaccard = c/(a+b+c),
+              russelrao = c/n,
+              rodgerstanimoto = (c+d)/(2*a+2*b+c+d),
+              cosine = ,
+              achiai = ,
+              carbo = c/sqrt((a+c)*(b+c)),
+              baroniurbanibuser = (sqrt(c*d)+c)/(sqrt(c*d)+a+b+c),
+              kulczynski2 = .5*(c/(a+c)+c/(b+c)),
+
+              ## Dissimilarity
+              hamming = a+b,
+              meanHamming = (a+b)/(a+b+c+d),
+              soergel = (a+b)/(a+b+c),
+              patternDifference = (a*b)/(a+b+c+d)^2,
+              variance = (a+b)/(4*n),
+              size = (a-b)^2/n^2,
+              shape = (a+b)/n-((a-b)/(n))^2,
+
+              ## Composite
+              hamann = (c+d-a-b)/(a+b+c+d),
+              yule = (c*d-a*b)/(c*d+a*b),
+              pearson = (c*d-a*b)/sqrt((a+c)*(b+c)*(a+d)*(b+d)),
+              dispersion = (c*d-a*b)/n^2,
+              ## spelled as in the 'method' choices above so that the
+              ## branch is reachable
+              mcconnaughey = (c^2-a*b)/((a+c)*(b+c)),
+              stiles = log10(n*(abs(c*d-a*b)-n/2)^2/((a+c)*(b+c)*(a+d)*(b+d))),
+
+              ## Asymmetric
+              simpson = c/min((a+c),(b+c)),
+              petke = c/max((a+c),(b+c)))
+
+            dist
+          })
+
+## random.fingerprint: build a fingerprint of 'nbit' bits with 'on'
+## randomly sampled positions set. Both arguments must be positive.
+setGeneric("random.fingerprint",
+           function(nbit, on) standardGeneric("random.fingerprint"))
+setMethod("random.fingerprint", signature("numeric", "numeric"),
+          function(nbit, on) {
+            if (nbit <= 0) {
+              stop("Bit length must be positive integer")
+            }
+            if (on <= 0) {
+              stop("Number of bits to be set to 1 must be positive integer")
+            }
+            positions <- sample(1:nbit, size=on)
+            new("fingerprint",
+                nbit=nbit,
+                bits=positions,
+                provider="R",
+                folded=FALSE)
+          })
Added: pkg/R/ops.R
===================================================================
--- pkg/R/ops.R (rev 0)
+++ pkg/R/ops.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,54 @@
+## Bitwise AND: the result has a bit on exactly where both operands do.
+## Stops when the operands differ in bit length.
+setMethod("&", c("fingerprint", "fingerprint"),
+          function(e1, e2) {
+            if (e1@nbit != e2@nbit)
+              stop("fp1 & fp2 must of the same bit length")
+
+            new("fingerprint",
+                bits=intersect(e1@bits, e2@bits),
+                nbit=e1@nbit,
+                provider="R")
+          })
+
+## Bitwise OR: the result has a bit on wherever either operand does.
+## Stops when the operands differ in bit length.
+setMethod("|", c("fingerprint", "fingerprint"),
+          function(e1, e2) {
+            if (e1@nbit != e2@nbit)
+              stop("fp1 & fp2 must of the same bit length")
+
+            new("fingerprint",
+                bits=union(e1@bits, e2@bits),
+                nbit=e1@nbit,
+                provider="R")
+          })
+
+## Bitwise NOT: the result has a bit on at every position where the
+## operand has it off.
+setMethod("!", c("fingerprint"),
+          function(x) {
+            ## seq_len() yields an empty vector for a zero-length
+            ## fingerprint, where 1:nbit would give c(1, 0)
+            positions <- seq_len(x@nbit)
+            ## negative indexing with an empty vector would select nothing
+            off <- if (length(x@bits) > 0) positions[ -x@bits ] else positions
+            new("fingerprint",
+                bits=off,
+                nbit=x@nbit,
+                provider="R")
+          })
+
+## Bitwise XOR: the result has a bit on where exactly one operand does.
+## Stops when the operands differ in bit length.
+setMethod("xor", c("fingerprint", "fingerprint"),
+          function(x,y) {
+            if (x@nbit != y@nbit)
+              stop("e1 & e2 must of the same bit length")
+
+            ## expand both operands to dense logical vectors and let the
+            ## logical xor() do the comparison
+            lhs <- rep(FALSE, x@nbit)
+            rhs <- rep(FALSE, y@nbit)
+            lhs[x@bits] <- TRUE
+            rhs[y@bits] <- TRUE
+
+            new("fingerprint",
+                bits=which(xor(lhs, rhs)),
+                nbit=x@nbit,
+                provider="R")
+          })
+
Added: pkg/R/read.R
===================================================================
--- pkg/R/read.R (rev 0)
+++ pkg/R/read.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,53 @@
+## Line handler that extracts the bit positions listed between braces,
+## e.g. "{1, 5, 7}".
+##
+## Arguments:
+##   line - a single line of text
+## Value:
+##   list(NA, bits): no name is extracted, 'bits' is the numeric vector
+##   of the positions found between the braces
+cdk.lf <- function(line) {
+  hit <- regexpr("{([0-9,\\s]*)}", line, perl=TRUE)
+  ## keep the text strictly between the braces and drop the commas
+  inner <- substr(line, hit + 1, hit + attr(hit, "match.length") - 2)
+  inner <- gsub(',', '', inner)
+  bits <- lapply(strsplit(inner, ' '), as.numeric)
+  list(NA, bits[[1]])
+}
+
+## Line handler that extracts the blank separated bit positions enclosed
+## in double quotes, e.g. "1 5 7".
+##
+## Arguments:
+##   line - a single line of text
+## Value:
+##   list(NA, bits): no name is extracted, 'bits' is the numeric vector
+##   of the positions found between the quotes
+moe.lf <- function(line) {
+  hit <- regexpr("\"([0-9\\s]*)\"", line, perl=TRUE)
+  ## keep the text strictly between the quotes
+  inner <- substr(line, hit + 1, hit + attr(hit, "match.length") - 2)
+  bits <- lapply(strsplit(inner, ' '), as.numeric)
+  list(NA, bits[[1]])
+}
+
+## Line handler for whitespace separated records: the first field is the
+## name, the last two fields are discarded and the fields in between are
+## returned as numeric bit positions.
+bci.lf <- function(line) {
+  fields <- strsplit(line, '\\s')[[1]]
+  nfield <- length(fields)
+  middle <- fields[-c(1, nfield, nfield - 1)]
+  list(fields[1], as.numeric(middle))
+}
+
+## Read fingerprints from a text file, one fingerprint per line.
+##
+## Arguments:
+##   f      - path of the file to read
+##   size   - bit length stored in every fingerprint that is created
+##   lf     - line handler; called with one line and expected to return a
+##            list whose first element is the name (or NA) and whose second
+##            element holds the positions of the on bits
+##   header - if TRUE the first line of the file is skipped
+## Value:
+##   list of 'fingerprint' objects, one per line read; the 'provider' slot
+##   holds the expression that was passed as 'lf'
+fp.read <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE) {
+  ## substitute() also sees the default, so the label is defined even when
+  ## 'lf' is not given in the call
+  provider <- paste(deparse(substitute(lf)), collapse = "")
+
+  fcon <- file(description=f, open='r')
+  ## close the connection even when a line handler or new() fails
+  on.exit(close(fcon), add = TRUE)
+  lines <- readLines(fcon, n=-1)
+  if (header) lines <- lines[-1]
+
+  fplist <- vector("list", length(lines))
+  for (i in seq_along(lines)) {
+    dat <- lf(lines[i])
+    name <- if (is.na(dat[[1]])) "" else dat[[1]]
+
+    fplist[[i]] <- new("fingerprint",
+                       nbit=size,
+                       bits=as.numeric(dat[[2]]),
+                       folded=FALSE,
+                       provider=provider,
+                       name=name)
+  }
+  fplist
+}
+
+# Reads a fingerprint file straight into matrix form. The bit length has
+# to be given by the caller because fp.read cannot derive it from the file.
+fp.read.to.matrix <- function(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE) {
+  fp.to.matrix(fp.read(f, size, lf, header))
+}
Added: pkg/R/zzz.R
===================================================================
--- pkg/R/zzz.R (rev 0)
+++ pkg/R/zzz.R 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1 @@
+## Load hook: makes sure the 'methods' package is attached.
+.onLoad <- function(lib, pkg) {
+  require(methods)
+}
Added: pkg/README
===================================================================
--- pkg/README (rev 0)
+++ pkg/README 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,10 @@
+1. Put any C/C++/Fortran code in 'src'
+2. If you have compiled code, add a .First.lib() function in 'R'
+ to load the shared library
+3. Edit the help file skeletons in 'man'
+4. Run R CMD build to create the index files
+5. Run R CMD check to check the package
+6. Run R CMD build to make the package file
+
+
+Read "Writing R Extensions" for more information.
Added: pkg/man/bitspec.Rd
===================================================================
--- pkg/man/bitspec.Rd (rev 0)
+++ pkg/man/bitspec.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,46 @@
+\name{bit.spectrum}
+\alias{bit.spectrum}
+\title{
+ Generate a Bit Spectrum from a List of Fingerprints
+}
+\description{
+The idea of comparing datasets using fingerprints was described in
+Guha \& Schurer (2008). The idea is that one can summarize the dataset
+by counting the frequency of occurrence of each bit position. The
+frequency is normalized by the number of fingerprints considered. Thus a
+collection of N fingerprints can be converted to a single vector of
+numbers highlighting the most frequent bits with respect to a given
+dataset. A plot of this vector looks like a traditional spectrum and
+hence the name.
+
+The bit spectra for two datasets (assuming that the same types of
+fingerprints have been used) allows one to compare the similarity of
+the datasets, without having to do a full pairwise similarity
+calculation. The difference between the structural features of the
+datasets can be quantified by evaluating the distance between the two
+bit spectra.
+}
+\usage{
+bit.spectrum(fplist)
+}
+\arguments{
+ \item{fplist}{
+ A list structure with each element being an object of class
+ \code{fingerprint}. These can be constructed by hand or
+ read from disk via \code{\link{fp.read}}.
+
+ All fingerprints in the list should be of the same length.
+ }
+}
+\value{
+A numeric vector of length equal to the size of the fingerprints.
+}
+\seealso{
+ \code{\link{distance}}, \code{\link{fp.read}}
+}
+\references{
+Guha, R.; Schurer, S.; \emph{J. Comp. Aid. Molec. Des.}, \bold{2008},
+ \emph{22}, 367-384.
+}
+\keyword{programming}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/distance.Rd
===================================================================
--- pkg/man/distance.Rd (rev 0)
+++ pkg/man/distance.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,129 @@
+\name{distance}
+\alias{distance}
+\title{
+Calculates the Similarity or Dissimilarity Between Two Fingerprints
+}
+\description{
+ A number of distance metrics can be calculated for binary
+ fingerprints. Some of these are actually similarity metrics and
+ thus represent the reverse of a distance metric.
+
+ The following are distance (dissimilarity) metrics
+ \itemize{
+ \item Hamming
+ \item Mean Hamming
+ \item Soergel
+ \item Pattern Difference
+ \item Variance
+ \item Size
+ \item Shape
+ }
+
+ The following metrics are similarity metrics and so the distance can
+ be obtained by subtracting the value from 1.0
+ \itemize{
+ \item Tanimoto
+ \item Dice
+ \item Modified Tanimoto
+ \item Simple
+ \item Jaccard
+ \item Russel-Rao
+ \item Rodgers Tanimoto
+ \item Cosine
+ \item Achiai
+ \item Carbo
+ \item Baroniurbanibuser
+ \item Kulczynski2
+ }
+
+ Finally the method also provides a set of composite and asymmetric
+ distance metrics
+ \itemize{
+ \item Hamann
+ \item Yule
+ \item Pearson
+ \item Dispersion
+ \item McConnaughey
+ \item Stiles
+ \item Simpson
+ \item Petke
+ }
+ The default metric is the Tanimoto coefficient.
+}
+\usage{
+distance(fp1, fp2, method)
+}
+\arguments{
+ \item{fp1}{
+ An object of class \code{fingerprint}
+ }
+ \item{fp2}{
+ An object of class \code{fingerprint}
+ }
+ \item{method}{
+ The type of distance metric desired. Partial matching is
+ supported and the default is \code{tanimoto}. Alternative values are
+ \itemize{
+ \item \code{euclidean}
+ \item \code{hamming}
+ \item \code{meanHamming}
+ \item \code{soergel}
+ \item \code{patternDifference}
+ \item \code{variance}
+ \item \code{size}
+ \item \code{shape}
+
+ \item \code{jaccard}
+ \item \code{dice}
+ \item \code{mt}
+ \item \code{simple}
+ \item \code{russelrao}
+ \item \code{rodgerstanimoto}
+ \item \code{cosine}
+ \item \code{achiai}
+ \item \code{carbo}
+ \item \code{baroniurbanibuser}
+ \item \code{kulczynski2}
+
+ \item \code{hamann}
+ \item \code{yule}
+ \item \code{pearson}
+ \item \code{mcconnaughey}
+ \item \code{stiles}
+
+ \item \code{simpson}
+ \item \code{petke}
+
+ }
+
+ }
+}
+\value{
+ Numeric value representing the distance in the specified metric between the
+ supplied fingerprint objects
+}
+\examples{
+# make two fingerprint vectors
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+
+# calculate the tanimoto coefficient
+distance(fp1,fp2) # should be 1
+
+# Invert the second fingerprint
+fp3 <- !fp2
+
+distance(fp1,fp3) # should be 0
+}
+
+\references{Fligner, M.A.; Verducci, J.S.; Blower, P.E.;
+ A Modification of the Jaccard-Tanimoto Similarity Index for
+ Diverse Selection of Chemical Compounds Using Binary Strings,
+ \emph{Technometrics}, 2002, \emph{44}(2), 110-119
+
+ Monve, V.; Introduction to Similarity Searching in
+ Chemistry, \emph{MATCH - Comm. Math. Comp. Chem.}, 2004, \emph{51}, 7-38
+}
+
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/facmat.Rd
===================================================================
--- pkg/man/facmat.Rd (rev 0)
+++ pkg/man/facmat.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,35 @@
+\name{fp.factor.matrix}
+\alias{fp.factor.matrix}
+\title{
+ Converts a List of Fingerprints to a data.frame of Factors
+}
+\description{
+This function will convert a \code{list} of fingerprint objects
+to a \code{data.frame} of factors with levels 1 and 0.
+}
+\usage{
+fp.factor.matrix(fplist)
+}
+\arguments{
+ \item{fplist}{
+ A list structure with each element being an object of class
+ \code{fingerprint}. These can be constructed by hand or
+ read from disk via \code{\link{fp.read}}
+ }
+}
+\value{
+A \code{data.frame} of factors with dimensions equal to \code{(length(fplist), bit length)}
+}
+\seealso{
+ \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.factor.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/fingerprint.Rd
===================================================================
--- pkg/man/fingerprint.Rd (rev 0)
+++ pkg/man/fingerprint.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,77 @@
+\name{fingerprint-class}
+\docType{class}
+\alias{fingerprint-class}
+\alias{distance,fingerprint,fingerprint,missing-method}
+\alias{distance,fingerprint,fingerprint,character-method}
+\alias{euc.vector,fingerprint-method}
+\alias{fold,fingerprint-method}
+\alias{random.fingerprint,numeric,numeric-method}
+
+\title{Class "fingerprint"}
+\description{This class represents binary fingerprints, usually
+ generated by a variety of cheminformatics software, but not
+ restricted to such
+ }
+\section{Objects from the Class}{
+Objects can be created by calls of the form \code{new("fingerprint", ...)}.
+ Fingerprints can traditionally be thought of as a vector of 1's and
+ 0's. However for large fingerprints this is inefficient and
+ instead we simply store the positions of the bits that are
+ on. Certain operations also need to know the length of the
+ original bit string and this length is stored in the object at
+ construction. Even though we store extra information along with
+ the bit positions, conceptually we still consider the objects as
+ simple bit strings. Thus the usual bitwise logical operations
+ (&, |, !, xor) can be applied to objects of this class.
+}
+\section{Slots}{
+ \describe{
+ \item{\code{bits}:}{Object of class \code{"numeric"} ~~ A vector
+ indicating the bit positions that are on. }
+ \item{\code{nbit}:}{Object of class \code{"numeric"} ~~ Indicates the length of the original bit string.}
+ \item{\code{folded}:}{Object of class \code{"logical"} ~~ Indicates
+ whether the fingerprint has been folded.}
+ \item{\code{provider}:}{Object of class \code{"character"} ~~
+ Indicates the source of the fingerprint. Can be useful to keep
+ track of what software generated the fingerprint.}
+ \item{\code{name}:}{Object of class \code{"character"} ~~
+ The name associated with the fingerprint. If no name is available
+ this gets set to an empty string}
+ }
+}
+\section{Methods}{
+ \describe{
+ \item{distance}{\code{signature(fp1 = "fingerprint", fp2 = "fingerprint", method = "missing")}: ... }
+ \item{distance}{\code{signature(fp1 = "fingerprint", fp2 = "fingerprint", method = "character")}: ... }
+ \item{euc.vector}{\code{signature(fp = "fingerprint")}: ... }
+ \item{fold}{\code{signature(fp = "fingerprint")}: ... }
+ \item{random.fingerprint}{\code{signature(nbit = "numeric", on = "numeric")}: ... }
+ }
+}
+\references{}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\seealso{
+ \code{\link{fp.read}}, \code{\link{fp.read.to.matrix}}
+ \code{\link{fp.sim.matrix}}, \code{\link{fp.to.matrix}},
+ \code{\link{fp.factor.matrix}}
+ \code{\link{random.fingerprint}}
+ }
+
+\examples{
+## make fingerprints
+x <- new("fingerprint", nbit=128, bits=sample(1:128, 100))
+y <- x
+distance(x,y) # should be 1
+x <- new("fingerprint", nbit=128, bits=sample(1:128, 100))
+distance(x,y)
+folded <- fold(x)
+
+## binary operations on fingerprints
+x <- new("fingerprint", nbit=8, bits=c(1,2,3,6,8))
+y <- new("fingerprint", nbit=8, bits=c(1,2,4,5,7,8))
+x & y
+x | y
+!x
+}
+\keyword{classes}
+\keyword{logic}
\ No newline at end of file
Added: pkg/man/fold.Rd
===================================================================
--- pkg/man/fold.Rd (rev 0)
+++ pkg/man/fold.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,33 @@
+
+\name{fold}
+\alias{fold}
+\title{
+ Fold a fingerprint
+}
+\description{
+ In many situations a fingerprint is generated using a large length (such as 1024 bits or more).
+ As a result of this, the fingerprints for a dataset can be very sparse. One approach to increasing
+ bit density of such fingerprints is to fold them. This is performed by dividing the original
+ fingerprint bitstring into two substrings of equal length and then perform an OR on
+ the two substrings.
+
+ It should be noted that many fingerprint generating routines will perform this internally.
+}
+\usage{
+fold(fp)
+}
+\arguments{
+ \item{fp}{
+ The fingerprint to fold. Should be of class \code{fingerprint}.
+ }
+}
+\value{
+An object of class \code{fingerprint} representing the folded fingerprint.
+}
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=64, bits=sample(1:64, 30))
+fold(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/fplogical.Rd
===================================================================
--- pkg/man/fplogical.Rd (rev 0)
+++ pkg/man/fplogical.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,39 @@
+\name{!,&,|}
+\alias{!}
+\alias{|}
+\alias{&}
+\alias{xor}
+\alias{|,fingerprint,fingerprint-method}
+\alias{&,fingerprint,fingerprint-method}
+\alias{xor,fingerprint,fingerprint-method}
+\alias{!,fingerprint-method}
+\title{
+ Logical Operators for Fingerprints
+}
+\description{
+These functions perform logical operations (AND, OR, NOT, XOR) on the supplied
+binary fingerprints. Thus for two fingerprints A and B we have
+\describe{
+\item{\code{&}}{Logical AND}
+\item{\code{|}}{Logical OR}
+\item{\code{xor}}{Logical XOR}
+\item{\code{!}}{Logical NOT (negation)}
+}
+}
+\arguments{
+ \item{e1}{
+ An object of class \code{fingerprint}
+ }
+ \item{e2}{
+ An object of class \code{fingerprint}
+ }
+ }
+\value{
+A fingerprint object
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+
+
+
Added: pkg/man/length.Rd
===================================================================
--- pkg/man/length.Rd (rev 0)
+++ pkg/man/length.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,27 @@
+\name{length}
+\alias{length}
+\alias{length,fingerprint-method}
+\title{
+ Fingerprint Bit Length
+}
+\description{
+ Returns the length of the fingerprint. That is, this is the length of
+ the entire bit string and not simply the number of bits that are on.
+}
+\usage{
+ \S4method{length}{fingerprint}(x)
+}
+\arguments{
+ \item{x}{
+ An object of class \code{fingerprint}
+ }
+}
+\value{
+The length of the bit string
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+
+
+
Added: pkg/man/linefunc.Rd
===================================================================
--- pkg/man/linefunc.Rd (rev 0)
+++ pkg/man/linefunc.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,35 @@
+\name{cdk.lf, moe.lf, bci.lf}
+\alias{cdk.lf}
+\alias{moe.lf}
+\alias{bci.lf}
+\title{
+ Functions to parse lines from fingerprint files
+}
+\description{
+Each of these functions takes a single line and parses it to produce
+a vector of integers which represents the positions of the 'on' bits in
+a fingerprint. This allows the user to use \code{fp.read} with arbitrary fingerprint
+files. A new file format can be handled by defining a new line parser function.
+Currently the three functions process fingerprint files obtained from the
+CDK (\url{http://cdk.sourceforge.net}), MOE (\url{http://chemcomp.com})
+and BCI (\url{http://www.digitalchemistry.co.uk/})
+
+}
+\usage{
+ cdk.lf(line)
+ moe.lf(line)
+ bci.lf(line)
+}
+\arguments{
+ \item{line}{
+ The line to parse
+ }
+}
+\value{
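+A list of two elements: the fingerprint name (\code{NA} when the line handler does not extract one) and a numeric vector giving the positions of the 'on' bits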
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+
Added: pkg/man/mat.Rd
===================================================================
--- pkg/man/mat.Rd (rev 0)
+++ pkg/man/mat.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,40 @@
+\name{fp.to.matrix}
+\alias{fp.to.matrix}
+\title{
+ Converts a List of Fingerprints to a Matrix
+}
+\description{
+ In general, fingerprint data is read from a file or obtained via
+ calls to an external generator and the return value is a list of fingerprints.
+ This function takes the list and returns a matrix having number of rows equal to
+ the number of fingerprints and the number of columns equal to the length of
+ the fingerprint. Each element is 1 or 0 (1's being specified by the positions
+ in each fingerprint vector)
+}
+\usage{
+fp.to.matrix(fplist)
+}
+\arguments{
+ \item{fplist}{
+ A list structure with each element being an object of class
+ \code{fingerprint}. These can be constructed by hand or
+ read from disk via \code{\link{fp.read}}
+ }
+}
+\value{
+A matrix with dimensions equal to \code{(length(fplist), bit length)}
+where bit length is a property of the fingerprint objects in the list.
+}
+\seealso{
+ \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.to.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/read.Rd
===================================================================
--- pkg/man/read.Rd (rev 0)
+++ pkg/man/read.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,54 @@
+\name{fp.read, fp.read.to.matrix}
+\alias{fp.read}
+\alias{fp.read.to.matrix}
+\title{
+ Functions to Read Fingerprints From Files
+}
+\description{
+\code{fp.read} reads in a set of fingerprints from a file. Fingerprint
+output from the CDK, MOE and BCI can be handled.
+
+Each fingerprint is represented as a \code{fingerprint} object.
+\code{fp.read} returns a \code{list} structure, each element being a
+\code{fingerprint} object.
+
+\code{fp.read.to.matrix} is a utility function that reads the fingerprints directly to
+matrix form (columns are the bit positions and the rows are the objects whose fingerprints
+have been evaluated)
+}
+
+\usage{
+fp.read(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE)
+fp.read.to.matrix(f='fingerprint.txt', size=1024, lf=cdk.lf, header=FALSE)
+}
+\arguments{
+ \item{f}{
+ File containing the fingerprints
+ }
+ \item{size}{
+ The bit length of the fingerprints being considered
+ }
+ \item{lf}{
+ A line reading function that parses a single line from
+ a fingerprint file. Currently, three such functions are provided
+ that parse the fingerprints from the output of the CDK, MOE and the
+ BCI toolkit, respectively.
+ }
+ \item{header}{
+ Indicates whether the first line of the fingerprint file is
+ a header line
+ }
+}
+\seealso{
+ \code{\link{cdk.lf}},
+ \code{\link{moe.lf}},
+ \code{\link{bci.lf}}
+}
+\value{
+ A \code{list} or \code{matrix} of fingerprints
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+
Added: pkg/man/rndfp.Rd
===================================================================
--- pkg/man/rndfp.Rd (rev 0)
+++ pkg/man/rndfp.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+\name{random.fingerprint}
+\alias{random.fingerprint}
+\title{
+ Generate Randomized Fingerprints
+}
+\description{
+ A utility function that can be used to generate binary fingerprints
+ of a specified length with a specified number of bit positions
+ (selected randomly) set to 1. Currently bit positions are selected uniformly
+}
+\usage{
+random.fingerprint(nbit,on)
+}
+\arguments{
+ \item{nbit}{
+ The length of the fingerprint, that is, the total number of bits.
+ Must be a positive integer.
+ }
+ \item{on}{
+ How many positions should be set to 1
+ }
+}
+\value{
+An object of class \code{fingerprint}
+}
+\examples{
+# make a fingerprint vector
+fp <- random.fingerprint(32, 16)
+as.character(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/show.Rd
===================================================================
--- pkg/man/show.Rd (rev 0)
+++ pkg/man/show.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,22 @@
+\name{show}
+\alias{show,fingerprint-method}
+\title{
+ String Representation of a Fingerprint
+}
+\description{
+Simply summarize the fingerprint.
+}
+
+\usage{
+\S4method{show}{fingerprint}(object)
+}
+\arguments{
+ \item{object}{
+ An object of class \code{fingerprint}
+ }
+}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
+\keyword{logic}
+
+
+
Added: pkg/man/sim.Rd
===================================================================
--- pkg/man/sim.Rd (rev 0)
+++ pkg/man/sim.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,46 @@
+\name{fp.sim.matrix}
+\alias{fp.sim.matrix}
+\title{
+ Calculates a Similarity Matrix for a Set of Fingerprints
+}
+\description{
+Given a set of fingerprints, a pairwise similarity can be calculated using the
+various distance metrics defined for binary strings. This function calculates
+the pairwise similarity matrix for a set of \code{fingerprint} objects supplied in a \code{list}
+structure. Any of the distance metrics provided by \code{\link{distance}} can be used and the
+default is the Tanimoto metric.
+
+Note that if the Euclidean distance is specified then the resultant matrix is a
+distance matrix and not a similarity matrix
+}
+\usage{
+fp.sim.matrix(fplist, method='tanimoto')
+}
+\arguments{
+ \item{fplist}{
+ A list structure with each element being an object of class
+ \code{fingerprint}. These can be constructed by hand or
+ read from disk via \code{\link{fp.read}}
+ }
+ \item{method}{
+ The type of distance metric to use. Alternatives are \code{euclidean} and
+ \code{dice} and \code{mt}. The default is \code{tanimoto}. Partial
+ matching is supported.
+ }
+}
+\value{
+A matrix with dimensions equal to \code{(length(fplist), length(fplist))}
+}
+\seealso{
+ \code{\link{distance}}, \code{\link{fp.read}}
+}
+\examples{
+# make fingerprint objects
+fp1 <- new("fingerprint", nbit=6, bits=c(1,2,5,6))
+fp2 <- new("fingerprint", nbit=6, bits=c(1,4,5,6))
+fp3 <- new("fingerprint", nbit=6, bits=c(2,3,4,5,6))
+
+fp.sim.matrix( list(fp1,fp2,fp3) )
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/string.Rd
===================================================================
--- pkg/man/string.Rd (rev 0)
+++ pkg/man/string.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,32 @@
+\name{as.character}
+\alias{as.character}
+\alias{as.character,fingerprint-method}
+\title{
+ Generates a String Representation of a Fingerprint
+}
+\description{
+ The function returns a string of 1's and 0's corresponding to the
+ fingerprint object supplied
+}
+\usage{
+\S4method{as.character}{fingerprint}(x)
+}
+\arguments{
+ \item{x}{
+ An object of class \code{fingerprint}
+ }
+}
+\value{
+A string of 1's and 0's
+}
+
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=32, bits=sample(1:32, 20))
+
+# print out the string representation
+as.character(fp)
+}
+\keyword{logic}
+\keyword{methods}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/man/vec.Rd
===================================================================
--- pkg/man/vec.Rd (rev 0)
+++ pkg/man/vec.Rd 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,33 @@
+\name{euc.vector}
+\alias{euc.vector}
+\title{
+ Euclidean Representation of Binary Fingerprints
+}
+\description{
+ Ordinarily, a binary fingerprint can be considered to represent a
+ corner of a nD hypercube. However in many cases using such a representation
+ can lead to a very sparse space. Consequently one approach is to convert
+ the fingerprint so that it represents points on a nD unit hypersphere.
+
+ The resultant fingerprint is then a nD coordinate.
+}
+\usage{
+euc.vector(fp)
+}
+\arguments{
+ \item{fp}{
+ An object of class \code{fingerprint}.
+ }
+}
+\value{
+A numeric of length equal to the bit length of the fingerprint. The
+result corresponds to a unit vector for a point
+on the nD hypersphere
+}
+\examples{
+# make a fingerprint vector
+fp <- new("fingerprint", nbit=8, bits=c(1,3,4,5,7))
+vec <- euc.vector(fp)
+}
+\keyword{logic}
+\author{Rajarshi Guha \email{rguha at indiana.edu}}
Added: pkg/src/fpdistance.c
===================================================================
--- pkg/src/fpdistance.c (rev 0)
+++ pkg/src/fpdistance.c 2008-10-24 21:02:21 UTC (rev 2)
@@ -0,0 +1,59 @@
+#include <R.h>
+#include <R_ext/Utils.h>
+
+#define X(_m,_i,_j,_nrow) _m[ _i + _nrow * _j ]
+
+#define METRIC_TANIMOTO 1
+#define METRIC_EUCLIDEAN 2
+
+double d_tanimoto(double*,double*,int);
+double d_euclidean(double*,double*,int);
+
+/**
+ * Computes one metric for a pair of fingerprints and stores it in *ret.
+ * fp1 and fp2 should be arrays of 1's and 0's of length *nbit (the size of
+ * the fingerprint); *metric is compared against the METRIC_* constants.
+ * *ret is 0.0 when *metric matches neither constant.
+ **/
+void fpdistance(double *fp1, double *fp2, int *nbit, int *metric, double *ret) {
+  const int which = *metric;
+  if (which == METRIC_TANIMOTO) {
+    *ret = d_tanimoto(fp1, fp2, *nbit);
+  } else if (which == METRIC_EUCLIDEAN) {
+    *ret = d_euclidean(fp1, fp2, *nbit);
+  } else {
+    *ret = 0.0;
+  }
+}
+
+/**
+ * Tanimoto coefficient nc/(na+nb+nc): nc = positions set in both arrays,
+ * na/nb = positions set only in fp1/fp2. Returns -1.0 if nbit <= 0. If no
+ * position is counted the ratio is 0/0 (NaN under IEEE-754 arithmetic).
+ * http://www.daylight.com/dayhtml/doc/theory/theory.finger.html
+ **/
+double d_tanimoto(double *fp1, double *fp2, int nbit) {
+  int i, nc = 0, na = 0, nb = 0;
+  if (nbit <= 0) return(-1.0);
+  for (i = 0; i < nbit; i++) {
+    if (fp1[i] == 1 && fp2[i] == 1) nc++;
+    if (fp1[i] == 1 && fp2[i] == 0) na++;
+    if (fp2[i] == 1 && fp1[i] == 0) nb++;
+  }
+  return ((double) nc) / (double) (na + nb + nc);
+}
+
+/**
+ * Returns sqrt((nc + nd) / nbit): nc = positions where both arrays hold 1,
+ * nd = positions where both hold 0. Returns -1.0 if nbit <= 0.
+ * http://www.daylight.com/dayhtml/doc/theory/theory.finger.html
+ **/
+double d_euclidean(double *fp1, double *fp2, int nbit) {
+  int i, nc = 0, nd = 0;
+  if (nbit <= 0) return(-1.0);
+  for (i = 0; i < nbit; i++) {
+    if (fp1[i] == 1 && fp2[i] == 1) nc++;
+    if (fp1[i] == 0 && fp2[i] == 0) nd++;
+  }
+  return sqrt(((double) nc + (double) nd) / (double) nbit);
+}
More information about the Fingerprint-commits
mailing list