[Depmix-commits] r360 - papers/jss

Tue Feb 23 12:57:50 CET 2010

Author: ingmarvisser
Date: 2010-02-23 12:57:49 +0100 (Tue, 23 Feb 2010)
New Revision: 360

Removed:
   papers/jss/latex.sh
Modified:
   papers/jss/dpx4.sh
   papers/jss/dpx4Rev.R
   papers/jss/dpx4Rev.Rnw
   papers/jss/dpx4Rev.tex
Log:
Completed turning paper into Sweave format; .sh file contains cmds to generate the paper.

Modified: papers/jss/dpx4.sh
===================================================================

--- papers/jss/dpx4.sh	2010-02-22 23:05:48 UTC (rev 359)
+++ papers/jss/dpx4.sh	2010-02-23 11:57:49 UTC (rev 360)
@@ -1,17 +1,19 @@
 #!/bin/sh
 
-R --vanilla < dpx4Sweave.R 
+cd ~/Documents/projects/depmixProject/codesvn/depmix/papers/jss/
 
-R --vanilla < dpx4Stangle.R
+R --vanilla < dpx4Sweave.R ;
 
-pdflatex dpx4Rev.tex
+R --vanilla < dpx4Stangle.R ;
 
-bibtex dpx4Rev
+pdflatex dpx4Rev.tex ;
 
-pdflatex dpx4Rev.tex
+bibtex dpx4Rev ;
 
-pdflatex dpx4Rev.tex
+pdflatex dpx4Rev.tex ;
 
+pdflatex dpx4Rev.tex ;
+
 open dpx4Rev.pdf
 
 

Modified: papers/jss/dpx4Rev.R
===================================================================
--- papers/jss/dpx4Rev.R	2010-02-22 23:05:48 UTC (rev 359)
+++ papers/jss/dpx4Rev.R	2010-02-23 11:57:49 UTC (rev 360)
@@ -1,49 +1,256 @@
 ###################################################
 ### chunk number 1: 
 ###################################################
-1:10
+library(depmixS4)
+data(speed)
+plot(as.ts(speed[1:168,]),main="Speed-accuracy trade-off")
 
 
 ###################################################
 ### chunk number 2: 
 ###################################################
-print(1:20)
+library(depmixS4)
+data(speed)
+set.seed(1)
+mod <- depmix(rt~1, data=speed, nstates=2, trstart=runif(4))
 
 
 ###################################################
 ### chunk number 3: 
 ###################################################
-1 + 1
-1 + pi
-sin(pi/2)
+fm <- fit(mod)
 
 
 ###################################################
 ### chunk number 4: 
 ###################################################
-library(stats)
-x <- rnorm(20)
-print(x)
-print(t1 <- t.test(x))
+fm
 
 
 ###################################################
 ### chunk number 5: 
 ###################################################
-data(iris)
-summary(iris)
+summary(fm)
 
 
 ###################################################
 ### chunk number 6: 
 ###################################################
-library(graphics)
-pairs(iris)
+set.seed(1)
+mod <- depmix(rt~1, data=speed, nstates=2, family=gaussian(), 
+    transition=~scale(Pacc), instart=runif(2))
+fm <- fit(mod)
 
 
 ###################################################
 ### chunk number 7: 
 ###################################################
-boxplot(Sepal.Length~Species, data=iris)
+summary(fm)
 
 
+###################################################
+### chunk number 8: 
+###################################################
+set.seed(1)
+mod <- depmix(list(rt~1,corr~1), data=speed, nstates=2, 
+    family=list(gaussian(),multinomial()),
+    transition=~scale(Pacc),instart=runif(2))
+fm <- fit(mod)
+
+
+###################################################
+### chunk number 9: 
+###################################################
+summary(fm)
+
+
+###################################################
+### chunk number 10: 
+###################################################
+data(balance)
+balance$age <- balance$age-5
+set.seed(1)
+mod <- mix(list(d1~1,d2~1,d3~1,d4~1), data=balance, nstates=2,
+    family=list(multinomial(), multinomial(), multinomial(),
+    multinomial()), respstart=c(rep(c(0.9,0.1),4),rep(c(0.1,0.9),4)), 
+    prior=~age, initdata=balance)
+fm <- fit(mod)
+
+
+###################################################
+### chunk number 11: 
+###################################################
+summary(fm)
+
+
+###################################################
+### chunk number 12: 
+###################################################
+setpars(mod, value=1:npar(mod))
+
+
+###################################################
+### chunk number 13: 
+###################################################
+setpars(mod, getpars(mod,which="fixed"))
+
+
+
+###################################################
+### chunk number 14: 
+###################################################
+trst <- c(0.9,0.1,0,0,0.1,0.9,0,0)
+mod <- depmix(list(rt~1,corr~1),data=speed,transition=~Pacc,
+	  nstates=2,family=list(gaussian(),multinomial()),
+	  trstart=trst,inst=c(.999,0.001))
+fm1 <- fit(mod)
+
+
+###################################################
+### chunk number 15: 
+###################################################
+# start with fixed and free parameters
+conpat <- c(0,1,rep(c(0,1),4),1,1,0,1,1,1,0,1)
+# constrain the beta's on the transition parameters to be equal
+conpat[6] <- conpat[10] <- 2
+fm2 <- fit(fm1,equal=conpat)
+
+
+###################################################
+### chunk number 16: 
+###################################################
+setClass("exgaus", contains="response")
+
+
+###################################################
+### chunk number 17: 
+###################################################
+setGeneric("exgaus", function(y, pstart = NULL, fixed = NULL, ...) 
+  standardGeneric("exgaus"))
+
+require(gamlss)
+require(gamlss.dist)
+
+setMethod("exgaus", 
+  signature(y="ANY"), 
+  function(y,pstart=NULL,fixed=NULL, ...) {
+    y <- matrix(y,length(y))
+    x <- matrix(1) 
+    parameters <- list()
+    npar <- 3
+    if(is.null(fixed)) fixed <- as.logical(rep(0,npar))
+    if(!is.null(pstart)) {
+      if(length(pstart)!=npar) stop("length of 'pstart' must be ",npar)
+      parameters$mu <- pstart[1]
+      parameters$sigma <- log(pstart[2])
+      parameters$nu <- log(pstart[3])
+    }
+    mod <- new("exgaus",parameters=parameters,fixed=fixed,x=x,y=y,npar=npar)
+    mod
+  }
+)
+
+
+###################################################
+### chunk number 18: 
+###################################################
+setMethod("dens","exgaus",
+    function(object,log=FALSE) {
+        dexGAUS(object at y, mu = predict(object), sigma = exp(object at parameters$sigma), nu = exp(object at parameters$nu), log = log)
+    }
+)
+
+setMethod("getpars","response",
+    function(object,which="pars",...) {
+        switch(which,
+            "pars" = {
+                parameters <- numeric()
+                parameters <- unlist(object at parameters)
+                pars <- parameters
+            },
+            "fixed" = {
+                pars <- object at fixed
+            }
+        )
+        return(pars)
+    }
+)
+
+setMethod("setpars","exgaus",
+    function(object, values, which="pars", ...) {
+        npar <- npar(object)
+        if(length(values)!=npar) stop("length of 'values' must be",npar)
+        # determine whether parameters or fixed constraints are being set
+		nms <- names(object at parameters)
+		switch(which,
+		  "pars"= {
+		      object at parameters$mu <- values[1]
+		      object at parameters$sigma <- values[2]
+		      object at parameters$nu <- values[3]
+		      },
+		  "fixed" = {
+		      object at fixed <- as.logical(values)
+		  }
+		  )
+        names(object at parameters) <- nms
+        return(object)
+    }
+)
+
+setMethod("predict","exgaus", 
+    function(object) {
+        ret <- object at parameters$mu
+        return(ret)
+    }
+)
+
+
+
+###################################################
+### chunk number 19: 
+###################################################
+setMethod("fit","exgaus",
+  function(object,w) {
+    if(missing(w)) w <- NULL
+    y <- object at y
+    fit <- gamlss(y~1,weights=w,family=exGAUS(),
+      control=gamlss.control(n.cyc=100,trace=FALSE),
+      mu.start=object at parameters$mu,
+      sigma.start=exp(object at parameters$sigma),
+      nu.start=exp(object at parameters$nu))
+    pars <- c(fit$mu.coefficients,fit$sigma.coefficients,fit$nu.coefficients)
+    object <- setpars(object,pars)
+    object
+  }
+)
+
+
+###################################################
+### chunk number 20: 
+###################################################
+rModels <- list( list( 
+		exgaus(speed$rt,pstart=c(5,.1,.1)),	
+		GLMresponse(formula=corr~1,data=speed,family=multinomial(),pstart=c(0.5,0.5))),
+	list(
+		exgaus(speed$rt,pstart=c(6,.1,.1)),
+		GLMresponse(formula=corr~1,data=speed,family=multinomial(),pstart=c(.1,.9))))
+
+
+###################################################
+### chunk number 21: 
+###################################################
+trstart=c(0.9,0.1,0.1,0.9)
+transition <- list()
+transition[[1]] <- transInit(~Pacc,nst=2,data=speed,pstart=c(0.9,0.1,0,0))
+transition[[2]] <- transInit(~Pacc,nst=2,data=speed,pstart=c(0.1,0.9,0,0))
+inMod <- transInit(~1,ns=2,pstart=c(0.1,0.9),data=data.frame(1))
+
+
+###################################################
+### chunk number 22: 
+###################################################
+mod <- makeDepmix(response=rModels,transition=transition,
+    prior=inMod,stat=FALSE)
+fm <- fit(mod)
+
+

Modified: papers/jss/dpx4Rev.Rnw
===================================================================
--- papers/jss/dpx4Rev.Rnw	2010-02-22 23:05:48 UTC (rev 359)
+++ papers/jss/dpx4Rev.Rnw	2010-02-23 11:57:49 UTC (rev 360)
@@ -83,15 +83,16 @@
 
 %\batchmode
 
-
-
 \SweaveOpts{echo=FALSE}
 \usepackage{a4wide}
 
-% \batchmode
+%\usepackage{Sweave}
 
 \begin{document}
 
+%set width of figures produced by Sweave
+\setkeys{Gin}{width=0.9\textwidth}
+
 \maketitle
 
 %% include your article here, just as usual
@@ -103,72 +104,856 @@
 \bf{DRAFT: DO NOT QUOTE WITHOUT CONTACTING AUTHOR}
 \end{center}
 
+\section{Introduction}
+%% Note: If there is markup in \(sub)section, then it has to be escape as above.
 
-A simple example that will run in any S engine: The integers from 1 to
-10 are
-<<print=TRUE>>=
-1:10
-<<results=hide>>=
-print(1:20)
-@ % the above is just to ensure that 2 code chunks can follow each other
+Markov and latent Markov models are frequently used in the social
+sciences, in different areas and applications.  In psychology, they
+are used for modelling learning processes (see \citealp{Wickens1982},
+for an overview, and e.g. \citealp{Schmittmann2006}, for a recent
+application).  In economics, latent Markov models are so-called regime
+switching models (see e.g., \citealp{Kim1994} and
+\citealp{Ghysels1994}).  Further applications include speech
+recognition \citep{Rabiner1989}, EEG analysis \citep{Rainer2000}, and
+genetics \citep{Krogh1998}.  In these latter areas of application,
+latent Markov models are usually referred to as hidden Markov models.
+See for example \citet{Fruhwirth2006} for an overview of hidden Markov
+models with extensions.  Further examples of applications can be found
+in e.g.\ \citet[][chapter~1]{Cappe2005}.
 
-We can also emulate a simple calculator:
-<<echo=TRUE,print=TRUE>>=
-1 + 1
-1 + pi
-sin(pi/2)
+The \pkg{depmixS4} package was motivated by the fact that while Markov
+models are used commonly in the social sciences, no comprehensive
+package was available for fitting such models.  Existing software for
+estimating Markovian models include Panmark \citep{Pol1996}, and for
+latent class models Latent Gold \citep{Vermunt2003}.  These programs
+lack a number of important features, besides not being freely
+available.  There are currently some packages in \proglang{R} that
+handle hidden Markov models but they lack a number of features that we
+needed in our research.  In particular, \pkg{depmixS4} was designed to
+meet the following goals:
+
+\begin{enumerate}
+	
+	\item to be able to estimate parameters subject to general
+	linear (in)equality constraints
+	
+	\item to be able to fit transition models with covariates, i.e.,
+	to have time-dependent transition matrices
+	
+	\item to be able to include covariates in the prior or initial
+	state probabilities
+	
+	\item to be easily extensible, in particular, to allow users to
+	easily add new uni- or multivariate response distributions and
+	new transition models, e.g., continuous time observation models
+	
+\end{enumerate}
+
+Although \pkg{depmixS4} was designed to deal with longitudinal or time
+series data, for say $T>100$, it can also handle the limit case when
+$T=1$.  In this case, there are no time dependencies between observed
+data and the model reduces to a finite mixture or latent class model.
+While there are specialized packages to deal with mixture data, as far
+as we know these don't allow the inclusion of covariates on the prior
+probabilities of class membership.  The possibility to estimate the
+effects of covariates on prior and transition probabilities is a
+distinguishing feature of \pkg{depmixS4}.  In the next section, we
+provide an outline of the model and likelihood equations.
+
+
+\section{The dependent mixture model}
+
+The data considered here have the general form $\vc{O}_{1:T}=
+(O_{1}^{1}, \ldots, O_{1}^{m}$, $O_{2}^{1}, \ldots, O_{2}^{m}$,
+\ldots, $O_{T}^{1}, \ldots, O_{T}^{m})$ for an $m$-variate time series
+of length $T$.  As an example, consider a time series of responses
+generated by a single participant in a psychological response time
+experiment.  The data consists of three variables, response time,
+response accuracy, and a covariate which is a pay-off variable
+reflecting the relative reward for speeded and/or accurate responding.
+These variables are measured on 168, 134 and 137 occasions
+respectively (the first part of this series is plotted in
+Figure~\ref{fig:speed}).  These data are more fully described in
+\citet{Dutilh2009}.
+
+\begin{figure}[htbp]
+\begin{center}
+<<fig=TRUE,echo=FALSE,height=5,width=7,eps=FALSE>>=
+library(depmixS4)
+data(speed)
+plot(as.ts(speed[1:168,]),main="Speed-accuracy trade-off")
 @
+	 \caption{Response times (rt), accuracy (corr) and pay-off values (Pacc) for
+	  the first series of responses in dataset \code{speed}.}
+	  \label{fig:speed}
+	  
+\end{center}
+\end{figure}
 
-Now we look at Gaussian data:
+The latent Markov model is usually associated with data of this type,
+in particular for multinomially distributed responses.  However,
+commonly employed estimation procedures \citep[e.g.,][]{Pol1996}, are
+not suitable for long time series due to underflow problems.  In
+contrast, the hidden Markov model is typically only used for `long'
+univariate time series \citep[][chapter~1]{Cappe2005}.  We use the
+term ``dependent mixture model'' because one of the authors (Ingmar
+Visser) thought it was time for a new name to relate these
+models\footnote{Only later did I find out that \citet{Leroux1992}
+already coined the term dependent mixture models in an application
+with hidden Markov mixtures of Poisson count data.}.
 
+The fundamental assumption of a dependent mixture model is that at any
+time point, the observations are distributed as a mixture with $n$
+components (or states), and that time-dependencies between the
+observations are due to time-dependencies between the mixture
+components (i.e., transition probabilities between the components).
+These latter dependencies are assumed to follow a first-order Markov
+process.  In the models we are considering here, the mixture
+distributions, the initial mixture probabilities and transition
+probabilities can all depend on covariates $\vc{z}_t$.
+
+%transition probability functions $a_{ij}$ and the initial state
+%probability functions $\greekv{\pi}$ may depend on covariates as well
+%as the response distributions $\vc{b_{j}^{k}}$.  See for example
+%\citet{Fruhwirth2006} for an overview of hidden Markov models with
+%extensions.
+
+In a dependent mixture model, the joint likelihood of observations
+$\vc{O}_{1:T}$ and latent states $\vc{S}_{1:T} = (S_1,\ldots,S_T)$,
+given model parameters $\greekv{\theta}$ and covariates $\vc{z}_{1:T}
+= (\vc{z}_1,\ldots,\vc{z}_T)$, can be written as:
+\begin{equation}
+	\Prob(\vc{O}_{1:T},\vc{S}_{1:T}|\greekv{\theta},\vc{z}_{1:T}) =  
+	\pi_{i}(\vc{z}_1) \vc{b}_{S_t}(\vc{O}_1|\vc{z}_{1})
+	\prod_{t=1}^{T-1} a_{ij}(\vc{z}_{t}) \vc{b}_{S_t}(\vc{O}_{t+1}|\vc{z}_{t+1}),
+\end{equation}
+where we have the following elements:
+\begin{enumerate}
+	
+	\item $S_{t}$ is an element of $\mathcal{S}=\{1\ldots n\}$, a set
+	of $n$ latent classes or states.
+	
+	\item $\pi_{i}(\vc{z}_1) = \Prob(S_1 = i|\vc{z}_1)$, giving the
+	probability of class/state $i$ at time $t=1$ with covariate
+	$\vc{z}_1$.
+	
+	\item $a_{ij}(\vc{z}_t) = \Prob(S_{t+1}=j|S_{t}=i,\vc{z}_t)$,
+	provides the probability of a transition from state $i$ to state
+	$j$ with covariate $\vc{z}_t$,
+	
+	\item $\vc{b}_{S_t}$ is a vector of observation densities
+	$b_{j}^k(\vc{z}_t) = \Prob(O_{t}^k|S_t = j, \vc{z}_t)$ that
+	provide the conditional densities of observations $O_{t}^k$
+	associated with latent class/state $j$ and covariate $\vc{z}_t$,
+	$j=1, \ldots, n$, $k=1, \ldots, m$.
+	
+\end{enumerate}
+
+%In the next paragraphs, the likelihood
+%equation and estimation procedures for the dependent mixture model are
+%described for data of the above form.  
+
+%The likelihood of the dependent mixture model conditional on the
+%unknown (or hidden) state sequence $\vc{S}_{1:T} = (S_1,\ldots,S_T)$ 
+%and the model is given
+%by:
+%\begin{equation}
+%	\Prob( \vc{O}_{1:T} | \vc{S}_{1:T}, \greekv{\theta} ) = 
+%	\prod_{t=1}^{T}  \Prob( \vc{O}_{t} | S_{t},  \greekv{\theta}),
+%\end{equation}
+%where $\greekv{\theta}$ is the parameter vector of the model.  To arrive
+%at the marginal likelihood of the data for parameter vector
+%$\greekv{\theta}$, i.e.\ unconditional on the state sequence, we need to sum
+%above likelihood over all possible state sequences. This likelihood 
+%is written as: 
+%\begin{equation}
+%	\Prob(\vc{O}_{1:T}|\greekv{\theta}) =  
+%	\sum_{all Ss} \pi_{i}(\vc{z}_1) \vc{b}_{1}(\vc{O}_{1}) 
+%	\prod_{t=1}^{T-1} a_{ij}(\vc{z}_{t}) \vc{b}_{j}(\vc{O}_{t+1}|\vc{z}_{t+1}),
+%\end{equation}
+%where we have the following elements:
+%\begin{enumerate}
+%	
+%	\item $S_{t}$ is an element of $\mathcal{S}=\{1\ldots n\}$, a set
+%	of $n$ latent classes or states; we write for short $S_{i}$ to
+%	denote $S_{t}=i$.
+%	
+%	\item $\pi_{i}(\vc{z}_1) = \Prob(S_1 = i|\vc{z}_1)$,
+%	giving the probability of class/state $i$ at time $t=1$ with
+%	covariate $\vc{z}_1$.
+%	
+%	\item $a_{ij}(\vc{z}_t) = \Prob(S_{t+1}=j|S_{t}=i,\vc{z}_t)$,
+%	provides the probability of a transition from state $i$ to state
+%	$j$ with covariate $\vc{z}_t$,
+%	
+%	\item $\vc{b}_{j}$ is a vector of observation densities
+%	$b_j^k(\vc{z}_t) = \Prob(O_{t}^k|S_t = j, \vc{z}_t)$ that provide the
+%	conditional densities of observations $O_{t}^k$ associated with
+%	latent class/state $j$ and covariate $\vc{z}_t$, $j=1, \ldots, n$,
+%	$k=1, \ldots, m$. 
+%\end{enumerate}
+
+% The dependent mixture model is defined by the following equations: 
+% %\begin{align}
+% %	S_{t} &= A S_{t-1}, t=2, \ldots, T \\
+% %	O_{t} &= b(O_{t}|S_{t}), 
+% %\end{align}
+% \begin{align}
+% 	Pr(S_{t} = j) &= \sum_{i=1}^n a_{ij}(\vc{x}_t) Pr(S_{t-1} = i), t=2, \ldots, T \\
+% 	p(O_{t}^k|S_t &= j, \vc{x}_t) = b_j^k(\vc{x}_t)
+% \end{align}
+%where $S_{t}$ is a sequence of hidden states, $A$ is a transition
+%matrix, $O_{t}$ is an (possibly multivariate) observation and $b$ is a
+%density function for $O_{t}$ conditional on the hidden state $S_{t}$.
+
+For the example data above, $b_j^k$ could be a Gaussian distribution
+function for the response time variable, and a Bernoulli distribution
+for the accuracy variable.  In the models we are considering here,
+both the transition probability functions $a_{ij}$ and the initial
+state probability functions $\greekv{\pi}$ may depend on covariates as
+well as the response distributions $b_{j}^{k}$.
+
+\subsection{Likelihood}
+
+To obtain maximum likelihood estimates of the model parameters, we
+need the marginal likelihood of the observations.  For hidden Markov
+models, this marginal (log-)likelihood is usually computed by the
+so-called forward-backward algorithm \citep{Baum1966,Rabiner1989}, or
+rather by the forward part of this algorithm.  \cite{Lystig2002}
+changed the forward algorithm in such a way as to allow computing the
+gradients of the log-likelihood at the same time.  They start by
+rewriting the likelihood as follows (for ease of exposition the
+dependence on the model parameters and covariates is dropped here):
+\begin{equation}
+	L_{T} = \Prob(\vc{O}_{1:T}) = \prod_{t=1}^{T} 
+	\Prob(\vc{O}_{t}|\vc{O}_{1:(t-1)}), 
+	\label{condLike}
+\end{equation}
+where $\Prob(\vc{O}_{1}|\vc{O}_{0}):=\Prob(\vc{O}_{1})$. Note that for a 
+simple, i.e.\ observed, Markov chain these probabilities reduce to 
+$\Prob(\vc{O}_{t}|\vc{O}_{1},\ldots, 
+\vc{O}_{t-1})=\Prob(\vc{O}_{t}|\vc{O}_{t-1})$.
+The log-likelihood can now be expressed as:
+\begin{equation}
+	l_{T} = \sum_{t=1}^{T} \log[\Prob(\vc{O}_{t}|\vc{O}_{1:(t-1)}].
+	\label{eq:condLogl}
+\end{equation}
+
+To compute the log-likelihood, \cite{Lystig2002} define the following 
+(forward) recursion:
+\begin{align}
+	\phi_{1}(j) &:= \Prob(\vc{O}_{1}, S_{1}=j) = \pi_{j} b_{j}(\vc{O}_{1}) 
+	\label{eq:fwd1} \\
+%\begin{split}
+	\phi_{t}(j) &:= \Prob(\vc{O}_{t}, S_{t}=j|\vc{O}_{1:(t-1)}) %\\
+	= \sum_{i=1}^{N} [\phi_{t-1}(i)a_{ij}b_{j}(\vc{O}_{t})] \times 
+(\Phi_{t-1})^{-1},
+	\label{eq:fwdt} 
+%\end{split} 
+\end{align}
+where $\Phi_{t}=\sum_{i=1}^{N} \phi_{t}(i)$. Combining 
+$\Phi_{t}=\Prob(\vc{O}_{t}|\vc{O}_{1:(t-1)})$, and 
+equation~(\ref{eq:condLogl}) gives the following expression for the 
+log-likelihood:
+\begin{equation}
+	l_{T} = \sum_{t=1}^{T} \log \Phi_{t}.
+	\label{eq:logl}
+\end{equation}
+
+
+\subsection{Parameter estimation}
+
+Parameters are estimated in \pkg{depmixS4} using the EM algorithm or
+through the use of a general Newton-Raphson optimizer.  In the EM
+algorithm, parameters are estimated by iteratively maximising the
+expected joint likelihood of the parameters given the observations and
+states.  Let $\greekv{\theta} = (\greekv{\theta}_1,
+\greekv{\theta}_2,\greekv{\theta}_3)$ be the general parameter vector
+consisting of three subvectors with parameters for the prior model,
+transition model, and response model respectively.  The joint
+log-likelihood can be written as
+\begin{equation}
+\log \Prob(\vc{O}_{1:T}, \vc{S}_{1:T}|\vc{z}_{1:T},\greekv{\theta}) = \log 
+\Prob(S_1|\vc{z}_{1},\greekv{\theta}_1) 
++ \sum_{t=2}^{T} \log \Prob(S_t|S_{t-1},\vc{z}_{t-1},\greekv{\theta}_2) 
++ \sum_{t=1}^{T} \log \Prob(\vc{O}_t|S_t,\vc{z}_t,\greekv{\theta}_3)
+\end{equation}
+This likelihood depends on the unobserved states $\vc{S}_{1:T}$. In the 
+Expectation step, we replace these with their expected values given a set of 
+(initial) parameters $\greekv{\theta}' = (\greekv{\theta}'_1, 
+\greekv{\theta}'_2,\greekv{\theta}'_3)$ and observations $\vc{O}_{1:T}$. The expected 
+log likelihood 
+\begin{equation}
+Q(\greekv{\theta},\greekv{\theta}') = E_{\greekv{\theta}'} 
+(\log \Prob(\vc{O}_{1:T},\vc{S}_{1:T}|\vc{O}_{1:T},\vc{z}_{1:T},\greekv{\theta}))
+\end{equation}
+can be written as
+%\begin{equation}
+\begin{multline}
+\label{eq:Q}
+Q(\greekv{\theta},\greekv{\theta}') = 
+\sum_{j=1}^n \gamma_1(j) \log \Prob(S_1=j|\vc{z}_1,\greekv{\theta}_1) \\ 
++ \sum_{t=2}^T \sum_{j=1}^n \sum_{k=1}^n \xi_t(j,k) \log \Prob(S_t = k|S_{t-1} 
+= j,\vc{z}_{t-1},\greekv{\theta}_2)  \\
+ + \sum_{t=1}^T \sum_{j=1}^n \sum_{k=1}^m \gamma_t(j) 
+\ln \Prob(O^k_t|S_t=j,\vc{z}_t,\greekv{\theta}_3),
+\end{multline}
+%\end{equation}
+where the expected values $\xi_t(j,k) =  P(S_t = k, S_{t-1} = j|\vc{O}_{1:T},
+\vc{z}_{1:T},\greekv{\theta}')$ and $\gamma_t(j) = P(S_t = j|\vc{O}_{1:T},
+\vc{z}_{1:T},\greekv{\theta}')$ can be computed effectively by the 
+forward-backward algorithm \citep[see e.g.,][]{Rabiner1989}. The Maximisation 
+step consists of the maximisation of (\ref{eq:Q}) for $\greekv{\theta}$. As the 
+right hand side of (\ref{eq:Q}) consists of three separate parts, we can 
+maximise separately for $\greekv{\theta}_1$, $\greekv{\theta}_2$ and 
+$\greekv{\theta}_3$. In common models, maximisation for $\greekv{\theta}_1$ and 
+$\greekv{\theta}_2$ is performed by the \code{nnet.default} routine in the 
+\pkg{nnet} package \citep{Venables2002}, and maximisation for 
+$\greekv{\theta}_3$ by the standard \code{glm} routine. Note that for the latter 
+maximisation, the expected values $\gamma_t(j)$ are used as prior weights of the 
+observations $O^k_t$.
+
+
+
+%The maximising values of $a_{jk}$ are \cite<e.g.,>{Rabiner89}
+%\begin{equation}
+%\label{eq:MLA}
+%\hat{a}_{jk} = \frac{1}{N(T-1)} \sum_{i=1}^N \sum_{t=2}^T  \frac{\xi^i_t(j,k)}{\gamma_{t-1}^i(j)}.
+%\end{equation}
+%Fixing certain elements of $\mat{A}$ to 0, as in the DT version, does not affect the estimation of the other elements. When elements $a_{jk}$ are assumed identical, we simply extend the summation in (\ref{eq:MLA}) to include all those elements, changing the denominator $N(T-1)$ accordingly.
+
+%To estimate $\greekv{\lambda}$, we note that the term containing this parameter has the form of a weighted likelihood, where $\gamma_t^i(j)$ can be interpreted as the number of replications of $r^i_t$. Hence, we can rely on standard ML estimates of the logistic regression coefficients $\lambda_i$, using the values $\gamma_t^i(j)$ as ``case weights'' \cite<e.g.,>{Agresti02,McCullagh83}\footnote{To be more specific, we replicate each observation $r^i_t$ a total of 6 times, once for each of the states besides the random state (which offers no information regarding $\lambda_i$). For the $j$-th replication (corresponding to the $j$-th state), we used $v_j(\vec{x}_t)$ as a predictor variable and $\gamma^i_t(j)$ as a case weight. All these replications were used to obtain the maximum likelihood estimate of $\lambda$ from a single GLM, using the ``glm.fit'' function in R \cite{R}.}. The resulting parameter estimates are then used in a new Expectation step, followed by another Maximisation step, until convergence. Under mild regularity conditions, it can be shown that the EM algorithm converges to a (local) maximum of the likelihood \cite{EM}. 
+
+
+The EM algorithm however has some drawbacks.  First, it can be slow to
+converge towards the end of optimization.  Second, applying
+constraints to parameters can be problematic; in particular, EM can
+lead to wrong parameter estimates when applying constraints.  Hence,
+in \pkg{depmixS4}, EM is used by default in unconstrained models, but
+otherwise, direct optimization is done using \pkg{Rdonlp2}
+\citep{Tamura2009,Spellucci2002}, because it handles general linear
+(in)equality constraints, and optionally also non-linear constraints.
+
+%Need some more on EM and how/why it is justified to do separate weighted
+%fits of the response models and transition and prior models. 
+
+%Also mention use of glm, nnet and possibly other packages that we use. 
+
+\section[Using depmixS4]{Using \pkg{depmixS4}}
+
+Two steps are involved in using \pkg{depmixS4} which are illustrated
+below with examples:
+\begin{enumerate}
+	\item model specification with function \code{depmix} (or with \code{mix}
+	for latent class and finite mixture models, see example below on adding
+	covariates to prior probabilities)
+	
+	\item  model fitting with function \code{fit}
+\end{enumerate}
+We have separated the stages of model specification and model fitting
+because fitting large models can be fairly time-consuming and it is
+hence useful to be able to check the model specification before
+actually fitting the model.
+
+\subsection[Example data: speed]{Example data: \code{speed}}
+
+Throughout this article a data set called \code{speed} is used.  As
+already indicated in the Introduction, it consists of three time
+series with three variables: response time, accuracy, and a covariate
+Pacc which defines the relative pay-off for speeded versus accurate
+responding.  The participant in this experiment switches between fast
+responding at chance level and relatively slower responding at a high
+level of accuracy.  Interesting hypotheses to test are: is the
+switching regime symmetric?  Is there evidence for two states or does
+one state suffice?  Is the guessing state actually a guessing state,
+i.e., is the probability of a correct response at chance level (0.5)?
+
+\subsection{A simple model}
+
+A dependent mixture model is defined by the number of states and the
+initial state, state transition, and response distribution functions.
+A dependent mixture model can be created with the
+\code{depmix}-function as follows:
+
 <<>>=
-library(stats)
-x <- rnorm(20)
-print(x)
-print(t1 <- t.test(x))
+library(depmixS4)
+data(speed)
+set.seed(1)
+mod <- depmix(rt~1, data=speed, nstates=2, trstart=runif(4))
 @
-Note that we can easily integrate some numbers into standard text: The
-third element of vector \texttt{x} is \Sexpr{x[3]}, the
-$p$-value of the test is \Sexpr{format.pval(t1$p.value)}. % $
 
-Now we look at a summary of the famous iris data set, and we want to
-see the commands in the code chunks:
+The \code{depmix} function returns an object of class \code{depmix}
+which contains the model specification (and not a fitted model!).
+Note also that start values for the transition parameters are provided
+in this call using the \code{trstart} argument.  At this time, the
+package does not provide automatic starting values.
 
-\SweaveOpts{echo=true}
+<<>>=
+fm <- fit(mod)
+@
+The \code{fit}-function returns an object of class
+\code{depmix.fitted} which extends the \code{depmix} class, adding
+convergence information (and information about constraints if these
+were applied, see below).  The \code{print} method provides summary
+information on convergence, the log-likelihood and the AIC and BIC
+values:
+<<>>=
+fm
+@
 
-% the following code is R-specific, as data(iris) will not run in Splus. 
-% Hence, we mark it as R code. 
-<<engine=R>>=
-data(iris)
-summary(iris)
-@ %def
 
+These statistics can also be extracted using \code{logLik}, \code{AIC}
+and \code{BIC}, respectively.  By comparison, a 1-state model for
+these data, i.e. assuming there is no mixture, has a log-likelihood of
+-305.33, and 614.66, and 622.83 for the AIC and BIC respectively.
+Hence, the 2-state model fits the data much better than the 1-state
+model.  Note that the 1-state model can be specified using \code{mod <-
+depmix(rt~1, data=speed, nstates=1)}, although this model is trivial
+as it will simply return the mean and the sd of the rt variable.
 
-\begin{figure}[htbp]
-  \begin{center}
-<<fig=TRUE>>=
-library(graphics)
-pairs(iris)
+The \code{summary} method of \code{fit}ted models provides the
+parameter estimates, first for the prior probabilities model, second
+for the transition model, and third for the response models.
+<<>>=
+summary(fm)
 @
-    \caption{Pairs plot of the iris data.}
-  \end{center}
-\end{figure}
 
-\begin{figure}[htbp]
-  \begin{center}
-<<fig=true>>=
-boxplot(Sepal.Length~Species, data=iris)
+
+Note that, since no further arguments were specified, the initial
+state, state transition and response distributions were set to their
+defaults (multinomial distributions for the first two, and Gaussian
+distributions for the response distributions).
+
+\subsection{Covariates on transition parameters}
+
+By default, the transition probabilities and the initial state
+probabilities are parameterized using the multinomial logistic model.
+More precisely, each row of the transition matrix is parameterized by
+a baseline category logistic multinomial, meaning that the parameter
+for the base category is fixed at zero \citep[see][p.\ 267 ff., for
+multinomial logistic models and various
+parameterizations]{Agresti2002}.  The default baseline category is the
+first state.  Hence, for example, for a 3-state model, the initial
+state probability model would have three parameters of which the first
+is fixed at zero and the other two are freely estimated.
+
+The multinomial logistic model allows us to include covariates on the
+initial state and transition probabilities.  \citet{Chung2007} discuss
+a related latent transition model for repeated measurement data
+($T=2$) using logistic regression on the transition parameters; they
+rely on Bayesian methods of estimation.  Covariates on the transition
+probabilities can be specified using a one-sided formula as in the
+following example:
+<<>>=
+set.seed(1)
+mod <- depmix(rt~1, data=speed, nstates=2, family=gaussian(), 
+    transition=~scale(Pacc), instart=runif(2))
+fm <- fit(mod)
 @
-    \caption{Boxplot of sepal length grouped by species.}
-  \end{center}
-\end{figure}
 
+Applying \code{summary} to the \code{fit}ted object gives (only transition models
+printed here): 
+<<>>=
+summary(fm)
+@
 
-% R is not S-PLUS, hence this chunk will be ignored:
-<<engine=S4>>=
-function.that.comes.only.with.Splus(x)
+The summary provides all parameters of the model, also the (redundant)
+zeroes for the base-line category in the multinomial model.  The
+summary also prints the transition probabilities at the zero value of
+the covariate.  Note that scaling of the covariate is useful in this
+regard as it makes interpretation of these intercept probabilities
+easier.
+
+\subsection{Multivariate data}
+
+Multivariate data can be modelled by providing a list of formulae as
+well as a list of family objects for the distributions of the various
+responses.  In above examples we have only used the response times
+which were modelled with the Gaussian distribution.  The accuracy
+variable in the \code{speed} data can be modelled with a multinomial
+by specifying the following:
+<<>>=
+set.seed(1)
+mod <- depmix(list(rt~1,corr~1), data=speed, nstates=2, 
+    family=list(gaussian(),multinomial()),
+    transition=~scale(Pacc),instart=runif(2))
+fm <- fit(mod)
 @
 
+This provides the following fitted model parameters (only the 
+response parameters are given here): 
+<<>>=
+summary(fm)
+@
+
+As can be seen, state 1 has fast response times and accuracy is
+approximately at chance level (.474), whereas state 2 corresponds with
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/depmix -r 360