[Lme4-commits] r1843 - www/JSS
noreply at r-forge.r-project.org
Thu Aug 22 23:45:39 CEST 2013
Author: walker
Date: 2013-08-22 23:45:39 +0200 (Thu, 22 Aug 2013)
New Revision: 1843
Modified:
www/JSS/glmer.Rnw
www/JSS/glmerAppendix.tex
www/JSS/lmer.Rnw
Log:
updated authors and started examples section in glmer.Rnw
Modified: www/JSS/glmer.Rnw
===================================================================
--- www/JSS/glmer.Rnw 2013-08-18 04:08:57 UTC (rev 1842)
+++ www/JSS/glmer.Rnw 2013-08-22 21:45:39 UTC (rev 1843)
@@ -5,13 +5,15 @@
\newcommand{\scw}[1]{{\color{blue} \emph{#1}}}
\usepackage[american]{babel} %% for texi2dvi ~ bug
\usepackage{bm,amsmath,thumbpdf,amsfonts}
-\author{Douglas Bates\\University of Wisconsin - Madison\And
- Martin M\"achler\\ETH Zurich\And
- Ben Bolker\\McMaster University\And
- Steven C. Walker\\McMaster University
+\author{
+ Steven C. Walker\\McMaster University \And
+ Rune Haubo Bojesen Christensen\\Technical University of Denmark\AND
+ Douglas Bates\\University of Wisconsin - Madison \And
+ Ben Bolker\\McMaster University \AND
+ Martin M\"achler\\ETH Zurich
}
-\Plainauthor{Douglas Bates, Martin M\"achler, Ben Bolker, Steve Walker}
-\title{Fitting generalized \bmb{and nonlinear?} mixed-effects models using \pkg{lme4}}
+\Plainauthor{Steve Walker, Rune Haubo Bojesen Christensen, Douglas Bates, Martin M\"achler, Ben Bolker}
+\title{Fitting generalized linear mixed-effects models using \pkg{lme4}}
\Plaintitle{Fitting generalized linear mixed models using lme4}
\Shorttitle{GLMMs with lme4}
\Abstract{%
@@ -23,6 +25,20 @@
penalized least squares,
Cholesky decomposition}
\Address{
+ Steven C. Walker\\
+ Department of Mathematics \& Statistics \\
+ McMaster University \\
+ 1280 Main Street W \\
+ Hamilton, ON L8S 4K1, Canada \\
+ E-mail: \email{scwalker at math.mcmaster.ca}
+ \par\bigskip
+ Rune Haubo Bojesen Christensen \\
+ Technical University of Denmark \\
+ Matematiktorvet \\
+ Building 324, room 220 \\
+ 2800 Kgs. Lyngby \\
+ E-mail: \email{rhbc at dtu.dk}
+ \par\bigskip
Douglas Bates\\
Department of Statistics, University of Wisconsin\\
1300 University Ave.\\
@@ -42,13 +58,6 @@
1280 Main Street W \\
Hamilton, ON L8S 4K1, Canada \\
E-mail: \email{bolker at mcmaster.ca}
- \par\bigskip
- Steven C. Walker\\
- Department of Mathematics \& Statistics \\
- McMaster University \\
- 1280 Main Street W \\
- Hamilton, ON L8S 4K1, Canada \\
- E-mail: \email{scwalker at math.mcmaster.ca }
}
\newcommand{\Var}{\operatorname{Var}}
\newcommand{\abs}{\operatorname{abs}}
@@ -455,6 +464,35 @@
shows $t(z)$ for each of the unidimensional integrals in the
likelihood for the model \code{m1} at the parameter estimates.
+\section{Examples}
+
+<<cbpp>>=
+head(cbpp)
+@
+<<cbppPlot>>=
+library(ggplot2)
+ggplot(cbpp) + facet_wrap(~herd) +
+ geom_point(aes(period, incidence/size, size = size)) +
+ scale_y_continuous(limits = c(0, 1))
+boxplot(incidence/size ~ period, data = cbpp, las = 1,
+ xlab = 'Period', ylab = 'Probability of sero-positivity')
+@
+To account for the repeated measurements taken on each herd, we fit a
+binomial generalized linear mixed model with a random intercept for herd:
+<<cbppModelI>>=
+(gm1 <- glmer(incidence/size ~ period + (1 | herd), family = binomial,
+ data = cbpp, weights = size))
+summary(gm1)
+@
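+An equivalent way to specify the binomial response (a sketch; the
+two-column \code{cbind} form avoids the separate \code{weights}
+argument and should give the same fit; the object name \code{gm2} is
+ours):
+<<cbppModelII>>=
+gm2 <- glmer(cbind(incidence, size - incidence) ~ period + (1 | herd),
+             family = binomial, data = cbpp)
+fixef(gm2)
+@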
+<<cbppConfidence, eval=FALSE>>=
+## profile and parametric-bootstrap confidence intervals (not evaluated
+## here because they are relatively slow)
+profile.gm1 <- profile(gm1)
+confint(profile.gm1)
+set.seed(1)
+boot.gm1 <- bootMer(gm1, function(x) getME(x, 'theta'), nsim = 100)
+quantile(boot.gm1$t, probs = c(0.025, 0.975))
+@
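+A quick approximate alternative (a sketch of Wald intervals for the
+fixed effects, based on the asymptotic standard errors; these can be
+unreliable in small samples):
+<<cbppWald>>=
+cc <- coef(summary(gm1))    # estimates and standard errors
+cbind(estimate = cc[, "Estimate"],
+      lower    = cc[, "Estimate"] - 1.96 * cc[, "Std. Error"],
+      upper    = cc[, "Estimate"] + 1.96 * cc[, "Std. Error"])
+@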
+
+
+
\bibliography{lmer}
\end{document}
Modified: www/JSS/glmerAppendix.tex
===================================================================
--- www/JSS/glmerAppendix.tex 2013-08-18 04:08:57 UTC (rev 1842)
+++ www/JSS/glmerAppendix.tex 2013-08-22 21:45:39 UTC (rev 1843)
@@ -97,37 +97,38 @@
\begin{document}
%\SweaveOpts{concordance=TRUE}
-\section{Appendix: derivation of PIRLS (nAGQ = 0 version)}
+\section{Appendix: derivation of PIRLS}
-We seek to maximize PDEV the unscaled conditional log-likelihood
-for a GLMM over the conditional modes, $\bm u$ and fixed effect
-coefficients, $\bm\beta$. This problem is very similar to maximizing
-the log-likelihood for a GLM, about which there is a large amount of
-work. The standard algorithm for dealing with this kind of problem is
-iteratively reweighted least squares (IRLS). Here we modify IRLS by
-incorporating a penalty term that accounts for variation in the random
-effects, which we call penalized iteratively reweighted least squares (PIRLS).
+We seek to maximize the unscaled conditional log-likelihood for a GLMM
+over the conditional modes, $\bm u$. This problem is very similar to
+maximizing the log-likelihood for a GLM, about which there is a large
+amount of work. The standard algorithm for dealing with this kind of
+problem is iteratively reweighted least squares (IRLS). Here we modify
+IRLS by incorporating a penalty term that accounts for variation in
+the random effects, which we call penalized iteratively reweighted
+least squares (PIRLS).
The unscaled conditional log-likelihood takes the following form,
\begin{equation}
-g(\bm u ; \bm y, \bm\beta, \bm\theta) = \log p(\bm y, \bm u | \bm\beta, \bm\theta) =
-\bm\psi^\top \bm A \bm y -
-\bm a^\top \bm \phi +
-\bm c -
-\frac{1}{2}\bm u^\top \bm u -
-\frac{q}{2}\log{2\pi}
+ f(\bm u) = \log p(\bm y, \bm u | \bm\beta, \bm\theta) =
+ \bm\psi^\top \bm A \bm y -
+ \bm a^\top \bm \phi +
+ \bm c -
+ \frac{1}{2}\bm u^\top \bm u -
+ \frac{q}{2}\log{2\pi}
\end{equation}
-where $\bm\psi$ is the $n$-by-$1$ canonical parameter of an exponential family,
-$\bm\phi$ is the $n$-by-$1$ vector of cumulant functions, $\bm c$ an
-$n$-by-$1$ vector required for the log-likelihood to be based on a
-true probability distribution, and $\bm A$ is an $n$-by-$n$ diagonal
-matrix of prior weights, $\bm a$. Both $\bm a$ and $\bm c$ could depend on a dispersion
-parameter, although we ignore this possibility for now.
+where $\bm\psi$ is the $n$-by-$1$ canonical parameter of an
+exponential family, $\bm\phi$ is the $n$-by-$1$ vector of cumulant
+functions, $\bm c$ an $n$-by-$1$ vector required for the
+log-likelihood to be based on a true probability distribution, and
+$\bm A$ is an $n$-by-$n$ diagonal matrix of prior weights, $\bm
+a$. Both $\bm a$ and $\bm c$ could depend on a dispersion parameter,
+although we ignore this possibility for now.
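+As a concrete instance (a sketch, assuming a Bernoulli response with
+unit prior weights, so that $\bm A = \bm I_n$), the canonical
+parameter, cumulant function, and related quantities are
+\begin{displaymath}
+  \psi_i = \log\frac{\mu_i}{1 - \mu_i}, \qquad
+  \phi_i = -\log(1 - \mu_i), \qquad
+  \bm a = \bm 1, \qquad
+  \bm c = \bm 0,
+\end{displaymath}
+so that the unscaled conditional log-likelihood reduces to
+\begin{displaymath}
+  f(\bm u) = \sum_{i=1}^n \left[ y_i \log\frac{\mu_i}{1 - \mu_i} +
+  \log(1 - \mu_i) \right] - \frac{1}{2}\bm u^\top \bm u -
+  \frac{q}{2}\log{2\pi},
+\end{displaymath}
+and, for the canonical logit link used below, $\bm V = \bm M =
+\mathrm{diag}\{\mu_i(1 - \mu_i)\}$.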
The canonical parameter, $\bm\psi$, and cumulant function, $\bm\phi$,
depend on a linear predictor,
\begin{equation}
-\bm\eta = \bm o + \bm X \bm\beta + \bm Z \bm\Lambda_\theta \bm u
+ \bm\eta = \bm o + \bm X \bm\beta + \bm Z \bm\Lambda_\theta \bm u
\end{equation}
where $\bm o$ is an \emph{a priori} offset. The specific form of this
dependency is specified by the choice of the exponential family
@@ -141,8 +142,8 @@
for the Laplace approximation and adaptive Gauss-Hermite
quadrature. To do this maximization we use a variant of the Fisher
scoring method, which is the basis of the iteratively reweighted least
-squares algorithm for generalized linear models. Fisher scoring is itself
-based on Newton's method, which we apply first.
+squares algorithm for generalized linear models. Fisher scoring is
+itself based on Newton's method, which we apply first.
\subsection{Newton's method}
@@ -150,21 +151,22 @@
unscaled conditional log-likelihood. Following standard GLM theory
(e.g. McCullagh and Nelder 1989), we use the chain rule,
\begin{displaymath}
-\frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u} =
-\frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm\psi}
-\frac{d \bm\psi}{d \bm\mu}
-\frac{d \bm\mu}{d \bm\eta}
-\frac{d \bm\eta}{d \bm u}
+ \frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u} =
+ \frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm\psi}
+ \frac{d \bm\psi}{d \bm\mu}
+ \frac{d \bm\mu}{d \bm\eta}
+ \frac{d \bm\eta}{d \bm u}
\end{displaymath}
-The first derivative in this chain follow from basic results in GLM theory,
+The first derivative in this chain follows from basic results in GLM
+theory,
\begin{displaymath}
-\frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm\psi} =
-(\bm y - \bm\mu)^\top \bm A
+ \frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm\psi} =
+ (\bm y - \bm\mu)^\top \bm A
\end{displaymath}
-Again from standard GLM theory, the next two derivatives define the inverse diagonal variance
-matrix,
+Again from standard GLM theory, the next two derivatives define the
+inverse diagonal variance matrix,
\begin{displaymath}
-\frac{d \bm\psi}{d \bm\mu} = \bm V^{-1}
+ \frac{d \bm\psi}{d \bm\mu} = \bm V^{-1}
\end{displaymath}
and the diagonal Jacobian matrix,
\begin{displaymath}
@@ -172,16 +174,16 @@
\end{displaymath}
Finally, because $\bm\beta$ affects $\bm\eta$ only linearly,
\begin{displaymath}
-\frac{d \bm\eta}{d \bm u} = \bm Z \bm\Lambda_\theta
+ \frac{d \bm\eta}{d \bm u} = \bm Z \bm\Lambda_\theta
\end{displaymath}
Therefore we have,
\begin{equation}
-\frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u} =
-(\bm y - \bm\mu)^\top \bm A
-\bm V^{-1}
-\bm M
-\bm Z \bm\Lambda_\theta +
-\bm u^\top
+ \frac{d L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u} =
+ (\bm y - \bm\mu)^\top \bm A
+ \bm V^{-1}
+ \bm M
+ \bm Z \bm\Lambda_\theta +
+ \bm u^\top
\label{eq:dPDEVdu}
\end{equation}
This is very similar to the gradient for GLMs with respect to fixed
@@ -191,46 +193,47 @@
Again we apply the chain rule to take the Hessian,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu}
-\frac{d \bm\mu}{d \bm\eta}
-\frac{d \bm\eta}{d \bm u} + \bm I_q
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu}
+ \frac{d \bm\mu}{d \bm\eta}
+ \frac{d \bm\eta}{d \bm u} + \bm I_q
\end{equation}
which leads to,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu}\bm
-M \bm Z \bm\Lambda_\theta
- + \bm I_q
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu}\bm
+ M \bm Z \bm\Lambda_\theta
+ + \bm I_q
\end{equation}
The first derivative in this chain can be expressed as,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu} =
--\bm\Lambda_\theta^\top \bm Z^\top \bm M \bm V^{-1} \bm A +
-\bm\Lambda_\theta^\top \bm Z^\top \left[ \frac{d \bm M \bm V^{-1}}{d \bm\mu} \right] \bm A \bm R
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm\mu} =
+ -\bm\Lambda_\theta^\top \bm Z^\top \bm M \bm V^{-1} \bm A +
+ \bm\Lambda_\theta^\top \bm Z^\top \left[ \frac{d \bm M \bm V^{-1}}{d \bm\mu} \right] \bm A \bm R
\end{equation}
-where $\bm R$ is a diagonal residuals matrix with $\bm y-\bm\mu$ on the
-diagonal. The two terms arise from a type of product rule, where we
-first differentiate the residuals, $\bm y-\bm\mu$, and then the diagonal
-matrix, $\bm M \bm V^{-1}$, with respect to $\bm\mu$.
+where $\bm R$ is a diagonal residuals matrix with $\bm y-\bm\mu$ on
+the diagonal. The two terms arise from a type of product rule, where
+we first differentiate the residuals, $\bm y-\bm\mu$, and then the
+diagonal matrix, $\bm M \bm V^{-1}$, with respect to $\bm\mu$.
The Hessian can therefore be expressed as,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
--\bm \Lambda_\theta^\top \bm Z^\top \bm M \bm A^{1/2}\bm V^{-1/2}\left(
-\bm I_n -
-\bm V \bm M^{-1}\left[ \frac{d \bm M \bm V^{-1}}{d \bm\mu} \right] \bm R
-\right) \bm V^{-1/2}\bm A^{1/2} \bm M \bm Z \bm\Lambda_\theta + \bm I_q
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
+ -\bm \Lambda_\theta^\top \bm Z^\top \bm M \bm A^{1/2}\bm V^{-1/2}\left(
+ \bm I_n -
+ \bm V \bm M^{-1}\left[ \frac{d \bm M \bm V^{-1}}{d \bm\mu} \right] \bm R
+ \right) \bm V^{-1/2}\bm A^{1/2} \bm M \bm Z \bm\Lambda_\theta + \bm I_q
\label{eq:betaHessian}
\end{equation}
This result can be simplified by expressing it in terms of a weighted
-random-effects design matrix, $\bm U = \bm A^{1/2}\bm V^{-1/2}\bm M \bm Z \bm\Lambda_\theta$,
+random-effects design matrix, $\bm U = \bm A^{1/2}\bm V^{-1/2}\bm M
+\bm Z \bm\Lambda_\theta$,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
--\bm U^\top\left(
-\bm I_n -
-\bm V \bm M^{-1}\left[ \frac{d \bm V^{-1}\bm M}{d \bm\mu} \right] \bm R
-\right) \bm U + \bm I_q
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
+ -\bm U^\top\left(
+ \bm I_n -
+ \bm V \bm M^{-1}\left[ \frac{d \bm V^{-1}\bm M}{d \bm\mu} \right] \bm R
+ \right) \bm U + \bm I_q
\label{eq:betaHessian}
\end{equation}
@@ -238,30 +241,30 @@
-There are two ways to further simplify this expression for $\bm U^\top \bm U$. The
-first is to use the canonical link function for the family being
-used. Canonical links have the property that $\bm V = \bm M$, which means
-that for canonical links,
+There are two ways to further simplify this expression to $\bm U^\top
+\bm U + \bm I_q$. The first is to use the canonical link function for
+the family being used. Canonical links have the property that $\bm V =
+\bm M$, which means that for canonical links,
\begin{equation}
-\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
--\bm U^\top\left(
-\bm I_n -
-\bm I_n \left[ \frac{d \bm I_n}{d \bm\mu} \right] \bm R
-\right) \bm U + \bm I_q = \bm U^\top \bm U + \bm I_q
+ \frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm u} =
+ -\bm U^\top\left(
+ \bm I_n -
+ \bm I_n \left[ \frac{d \bm I_n}{d \bm\mu} \right] \bm R
+ \right) \bm U + \bm I_q = \bm U^\top \bm U + \bm I_q
\end{equation}
The second way to simplify the Hessian is to take its expectation with
-respect to the distribution of the response, conditional on the current
-values of the spherized random effects coefficients, $\bm u$. The
-diagonal residual matrix, $\bm R$, has expectation 0. Therefore,
-because the response only enters into the expression for the Hessian
-via $\bm R$, we have that,
+respect to the distribution of the response, conditional on the
+current values of the spherized random effects coefficients, $\bm
+u$. The diagonal residual matrix, $\bm R$, has expectation
+0. Therefore, because the response only enters into the expression for
+the Hessian via $\bm R$, we have that,
\begin{equation}
-E\left(\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm
- u} | \bm u \right) =
--\bm U^\top\left(
-\bm I_n -
-\bm U \bm M^{-1}\left[ \frac{d \bm V^{-1}\bm M}{d\mu} \right] E(\bm R)
-\right) \bm U + \bm I_q = \bm U^\top \bm U + \bm I_q
+ E\left(\frac{d^2 L(\bm\beta, \bm\theta | \bm y, \bm u)}{d \bm u d \bm
+ u} | \bm u \right) =
+ -\bm U^\top\left(
+ \bm I_n -
+ \bm V \bm M^{-1}\left[ \frac{d \bm V^{-1}\bm M}{d\bm\mu} \right] E(\bm R)
+ \right) \bm U + \bm I_q = \bm U^\top \bm U + \bm I_q
\label{eq:betaHessian}
\end{equation}
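To make the resulting update concrete, the following minimal R sketch
(our illustration only, not the implementation in \texttt{lme4})
iterates the Fisher-scoring step for the conditional modes $\bm u$ of a
logistic model with a single scalar random intercept per group, with
unit prior weights and with $\bm\beta$ and $\bm\theta$ held fixed at
assumed values:
\begin{verbatim}
set.seed(1)
ngrp <- 10; nper <- 20
g <- gl(ngrp, nper)                  # grouping factor
x <- rnorm(ngrp * nper)              # one covariate
beta <- c(-1, 0.5); theta <- 1       # assumed fixed-effect and covariance parameters
eta0 <- beta[1] + beta[2] * x + theta * rnorm(ngrp)[g]
y <- rbinom(length(eta0), 1, plogis(eta0))   # simulated binary response

X <- cbind(1, x)                     # fixed-effects model matrix
Z <- model.matrix(~ 0 + g)           # random-effects model matrix
Lambda <- theta * diag(ngrp)         # relative covariance factor

u <- rep(0, ngrp)                    # start at the prior mode
for (iter in 1:20) {
  eta <- drop(X %*% beta + Z %*% Lambda %*% u)
  mu  <- plogis(eta)
  w   <- mu * (1 - mu)               # M = V for the canonical (logit) link
  U   <- sqrt(w) * (Z %*% Lambda)    # weighted random-effects design matrix
  grad <- drop(crossprod(Z %*% Lambda, y - mu)) - u  # equals -0.5 * dPDEV/du
  u <- u + solve(crossprod(U) + diag(ngrp), grad)    # one Fisher-scoring (PIRLS) step
}
round(u, 3)                          # converged conditional modes
\end{verbatim}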
@@ -279,42 +282,42 @@
\end{itemize}
\item \textbf{Stationary points of PWRSS}
\begin{itemize}
- \item Find the stationary points of the PWRSS criterion.
+ \item Find the stationary points of the PWRSS criterion.
\item These first two steps show that the PDEV and PWRSS criteria
have the same stationary points, \emph{if} the chosen weights
used in PWRSS correspond to the prior weights divided by the
variance function at the minimum of PDEV.
- \item Therefore, if we minimize PWRSS with a correct guess at
- the variance function, then we also minimize PDEV.
- \item However, because this correct variance function is not known \emph{a priori},
- we iteratively solve PWRSS, updating the variance
- function at each iteration.
+ \item Therefore, if we minimize PWRSS with a correct guess at the
+ variance function, then we also minimize PDEV.
+ \item However, because this correct variance function is not known
+ \emph{a priori}, we iteratively solve PWRSS, updating the
+ variance function at each iteration.
\end{itemize}
\item \textbf{Pseudo-data and weighted residuals}
\begin{itemize}
- \item Express the PWRSS as a simple residual sum of squares using
- pseudo-data and weighted residuals.
- \item This expresses the problem as a standard non-linear least-squares
- problem, which can be solved iteratively using the Gauss-Newton
- method~\citep[\S2.2.3]{bateswatts88:_nonlin}.
+ \item Express the PWRSS as a simple residual sum of squares using
+ pseudo-data and weighted residuals.
+ \item This expresses the problem as a standard non-linear
+ least-squares problem, which can be solved iteratively using the
+ Gauss-Newton method~\citep[\S2.2.3]{bateswatts88:_nonlin}.
\end{itemize}
\item \textbf{Gauss-Newton}
\begin{itemize}
- \item Apply the Gauss-Newton method to the non-linear
- least-squares problem.
- \item This yields a sequence of normal equations.
- \item If the solutions of this sequence converge, then they
- converge to the minimizer of PDEV, \textbf{(this is the step I'm
- unsure of)}, beacuse PDEV and PWRSS have the same stationary
- points at convergence.
+ \item Apply the Gauss-Newton method to the non-linear least-squares
+ problem.
+ \item This yields a sequence of normal equations.
+ \item If the solutions of this sequence converge, then they converge
+ to the minimizer of PDEV, \textbf{(this is the step I'm unsure
+ of)}, because PDEV and PWRSS have the same stationary points at
+ convergence.
\end{itemize}
\item \textbf{Cholesky}
\begin{itemize}
- \item Solve the normal equations using block Cholesky
- decompositions.
- \item These particular Cholesky decompositions exploit the sparsity
- of some of the blocks of the cross-product matrices in the
- normal equations.
+ \item Solve the normal equations using block Cholesky
+ decompositions (see the sketch following this list).
+ \item These particular Cholesky decompositions exploit the sparsity
+ of some of the blocks of the cross-product matrices in the normal
+ equations.
\end{itemize}
\end{enumerate}
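As a small illustration of the final step (a toy dense sketch with
made-up dimensions, not the sparse block decompositions used in
\texttt{lme4}), a system of normal equations of the form $(\bm U^\top
\bm U + \bm I_q)\bm x = \bm b$ can be solved with a Cholesky factor and
two triangular solves:
\begin{verbatim}
set.seed(1)
q <- 5
U <- matrix(rnorm(40 * q), 40, q)   # dense stand-in for a weighted design matrix
b <- rnorm(q)                       # right-hand side
R <- chol(crossprod(U) + diag(q))   # upper-triangular factor: R'R = U'U + I
x <- backsolve(R, forwardsolve(t(R), b))      # solve R'z = b, then R x = z
max(abs((crossprod(U) + diag(q)) %*% x - b))  # residual check, ~ 0
\end{verbatim}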
@@ -357,19 +360,19 @@
\end{equation}
Similarly, differentiating with respect to $\bm u$ we have,
\begin{equation}
-\frac{d PDEV}{d \bm u} =
--2(\bm y - \bm\mu)^\top \bm A
-\bm V^{-1}
-\bm M
-\bm Z \bm\Lambda_\theta +
-2\bm u^\top
-\label{eq:dPDEVdu}
+ \frac{d PDEV}{d \bm u} =
+ -2(\bm y - \bm\mu)^\top \bm A
+ \bm V^{-1}
+ \bm M
+ \bm Z \bm\Lambda_\theta +
+ 2\bm u^\top
+ \label{eq:dPDEVdu}
\end{equation}
Again as in McCullagh and Nelder (1989), instead of trying to find the
roots of these equations directly, we use iterative least-squares
methods that converge on these roots. In particular, we define a
-penalized weighted residual sum-of-squares criterion (PWRSS) with the same
-roots as PDEV.
+penalized weighted residual sum-of-squares criterion (PWRSS) with the
+same roots as PDEV.
\subsection{Hessian of PDEV}
@@ -377,7 +380,7 @@
vectors of length $n$ and $m$ and $A$ be a diagonal matrix with
$n$-vector $a$ on the diagonal. Then,
\begin{equation}
-\frac{dAx}{dy} = x^\top \frac{da}{dy} + a^\top \frac{dx}{dy}
+ \frac{dAx}{dy} = x^\top \frac{da}{dy} + a^\top \frac{dx}{dy}
\end{equation}
\begin{equation}
@@ -419,26 +422,26 @@
There are two ways to simplify this expression to $V^\top V$. The
first is to use the canonical link function for the family being
-used. Canonical links have the property that $V = M$, which means
-that for canonical links,
+used. Canonical links have the property that $V = M$, which means that
+for canonical links,
\begin{equation}
-\underbrace{\frac{d^2 L}{d\beta d\beta}}_{p\times p} =
--V^\top\left(
-I_n -
-I_n \left[ \frac{dI_n}{d\mu} \right] R
-\right)V = V^\top V
+ \underbrace{\frac{d^2 L}{d\beta d\beta}}_{p\times p} =
+ -V^\top\left(
+ I_n -
+ I_n \left[ \frac{dI_n}{d\mu} \right] R
+ \right)V = V^\top V
\end{equation}
The second way to simplify the Hessian is to take its expectation with
-respect to the distribution of the response at the current
-parameter estimates. The diagonal residual matrix, $R$, has
-expectation 0. Therefore, because the response only enters into the
-expression for the Hessian via $R$, we have that,
+respect to the distribution of the response at the current parameter
+estimates. The diagonal residual matrix, $R$, has expectation
+0. Therefore, because the response only enters into the expression for
+the Hessian via $R$, we have that,
\begin{equation}
-\underbrace{E\left(\frac{d^2 L}{d\beta d\beta}\right)}_{p\times p} =
--V^\top\left(
-I_n -
-VM^{-1}\left[ \frac{dV^{-1}M}{d\mu} \right] E(R)
-\right)V = V^\top V
+ \underbrace{E\left(\frac{d^2 L}{d\beta d\beta}\right)}_{p\times p} =
+ -V^\top\left(
+ I_n -
+ VM^{-1}\left[ \frac{dV^{-1}M}{d\mu} \right] E(R)
+ \right)V = V^\top V
\label{eq:betaHessian}
\end{equation}
@@ -447,67 +450,69 @@
The penalized weighted residual sum of squares (PWRSS) is given by,
\begin{equation}
-\mathrm{PWRSS} = (\bm y-\bm\mu)^\top \bm W (\bm y-\bm\mu) + \bm u^\top
-\bm u
+ \mathrm{PWRSS} = (\bm y-\bm\mu)^\top \bm W (\bm y-\bm\mu) + \bm u^\top
+ \bm u
\label{eq:PWRSS}
\end{equation}
where the weights matrix $\bm W = \bm A \bm V^{-1}$. If we hold $\bm
-W$ fixed at an initial
-guess for the variance function, $\bm V = \bm V_0$, and differentiate with
-respect to $\bm\beta$ and $\bm u$ we obtain,
+W$ fixed at an initial guess for the variance function, $\bm V = \bm
+V_0$, and differentiate with respect to $\bm\beta$ and $\bm u$ we
+obtain,
\begin{equation}
-\frac{d \mathrm{PWRSS}}{d \bm\beta} =
--2 (\bm y - \bm\mu)^\top \bm A
-\bm V_0^{-1}
-\bm M
-\bm X
+ \frac{d \mathrm{PWRSS}}{d \bm\beta} =
+ -2 (\bm y - \bm\mu)^\top \bm A
+ \bm V_0^{-1}
+ \bm M
+ \bm X
\label{eq:dPWRSSdbeta}
\end{equation}
and,
\begin{equation}
-\frac{d \mathrm{PWRSS}}{d \bm u} =
--2 (\bm y - \bm\mu)^\top \bm A
-\bm V_0^{-1}
-\bm M
-\bm Z \bm\Lambda_\theta +
-2\bm u^\top
+ \frac{d \mathrm{PWRSS}}{d \bm u} =
+ -2 (\bm y - \bm\mu)^\top \bm A
+ \bm V_0^{-1}
+ \bm M
+ \bm Z \bm\Lambda_\theta +
+ 2\bm u^\top
\label{eq:dPWRSSdu}
\end{equation}
-Comparing Eqs. \ref{eq:dPWRSSdbeta} and \ref{eq:dPWRSSdu} with Eqs. \ref{eq:dPDEVdbeta} and \ref{eq:dPDEVdu} we see that if the intial guess at $\bm
-V$ is correct, then the two sets of equations have the same fixed
-points. Note that this result does not imply that PWRSS = PDEV. However, it does suggest an iterative scheme for finding the
-fixed effects parameters and the conditional modes. We use the Gauss-Newton
-method to derive such an iterative scheme. To do so, we use the techniques
-of pseudo-data and weighted residuals to express the problem of
-minimizing PWRSS as a non-linear least-squares problem.
+Comparing Eqs. \ref{eq:dPWRSSdbeta} and \ref{eq:dPWRSSdu} with
+Eqs. \ref{eq:dPDEVdbeta} and \ref{eq:dPDEVdu} we see that if the
+initial guess at $\bm V$ is correct, then the two sets of equations
+have the same fixed points. Note that this result does not imply that
+PWRSS = PDEV. However, it does suggest an iterative scheme for finding
+the fixed effects parameters and the conditional modes. We use the
+Gauss-Newton method to derive such an iterative scheme. To do so, we
+use the techniques of pseudo-data and weighted residuals to express
+the problem of minimizing PWRSS as a non-linear least-squares problem.
\subsection{Pseudo-data and weighted residuals}
-In Eq. \ref{eq:PWRSS} the response and its expectation are $\bm\mu$ and $\bm
-y$. Using the pseudo-data technique (standard in the mixed model
-literature) and the weighted residuals technique (standard in the
-weighted least-squares literature) we replace these vectors with,
+In Eq. \ref{eq:PWRSS} the response and its expectation are $\bm\mu$
+and $\bm y$. Using the pseudo-data technique (standard in the mixed
+model literature) and the weighted residuals technique (standard in
+the weighted least-squares literature) we replace these vectors with,
\begin{equation}
-\rho =
-\begin{bmatrix}
-\bm W^{1/2}\bm y \\
-\bm 0
-\end{bmatrix}
+ \rho =
+ \begin{bmatrix}
+ \bm W^{1/2}\bm y \\
+ \bm 0
+ \end{bmatrix}
\end{equation}
and fitted response,
\begin{equation}
-\nu =
-\begin{bmatrix}
-\bm W^{1/2}\bm \mu \\
-\bm u
-\end{bmatrix}
+ \nu =
+ \begin{bmatrix}
+ \bm W^{1/2}\bm \mu \\
+ \bm u
+ \end{bmatrix}
\end{equation}
The pseudo-data, $\bm 0$, and pseudo-mean, $\bm u$, allow us to remove
the penalty term in Eq.~\ref{eq:PWRSS}. By weighting $\bm y$ and $\bm\mu$ by the
square root of the weights matrix, we can remove the weights matrix
from the expression for PWRSS. In particular, we now have,
\begin{equation}
-\mathrm{PWRSS} = (\bm\rho - \bm\nu)^\top (\bm\rho - \bm\nu)
+ \mathrm{PWRSS} = (\bm\rho - \bm\nu)^\top (\bm\rho - \bm\nu)
\end{equation}
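To check that this matches the original criterion, substitute the
definitions of $\bm\rho$ and $\bm\nu$ above:
\begin{displaymath}
  (\bm\rho - \bm\nu)^\top (\bm\rho - \bm\nu) =
  \left(\bm W^{1/2}(\bm y - \bm\mu)\right)^\top \bm W^{1/2}(\bm y - \bm\mu) +
  (\bm 0 - \bm u)^\top (\bm 0 - \bm u) =
  (\bm y - \bm\mu)^\top \bm W (\bm y - \bm\mu) + \bm u^\top \bm u .
\end{displaymath}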
Minimizing PWRSS is a non-linear least-squares problem, because the
residuals $\bm\rho - \bm\nu$ depend non-linearly on $\bm\beta$ and
@@ -522,8 +527,8 @@
to both $\bm\beta$ and $\bm u$, under the condition that $\bm W = \bm
A \bm V^{-1}$ is fixed at $\bm W_0$,
\begin{displaymath}
-\frac{d \bm\nu}{d \bm\beta} =
-\begin{bmatrix}
+ \frac{d \bm\nu}{d \bm\beta} =
+ \begin{bmatrix}
\bm W_0^{1/2}\bm M \bm X \\
\bm 0
\end{bmatrix}
@@ -602,8 +607,8 @@
\bm 0
\end{bmatrix} -
\begin{bmatrix}
-\bm U_0 & V_0 \\
-\bm I_q & \bm 0
+ \bm U_0 & V_0 \\
+ \bm I_q & \bm 0
\end{bmatrix}
\begin{bmatrix}
\bm u \\
Modified: www/JSS/lmer.Rnw
===================================================================
--- www/JSS/lmer.Rnw 2013-08-18 04:08:57 UTC (rev 1842)
+++ www/JSS/lmer.Rnw 2013-08-22 21:45:39 UTC (rev 1843)
@@ -76,15 +76,19 @@
\newcommand{\yobs}{\ensuremath{\bm y_{\mathrm{obs}}}}
\newcommand*{\eq}[1]{eqn.~\ref{#1}}% or just {(\ref{#1})}
%
+
\SweaveOpts{engine=R,eps=FALSE,pdf=TRUE,width=10,height=6.5,strip.white=all}
\SweaveOpts{prefix=TRUE,prefix.string=figs/lmer,include=TRUE}
\SweaveOpts{keep.source=TRUE}
\setkeys{Gin}{width=\textwidth}
-<<preliminaries,echo=FALSE,results=hide>>=
+
+\begin{document}
+
+<<preliminaries>>=
options(width=69,show.signif.stars=FALSE,str=strOptions(strict.width="cut"))
library(lme4)
@
-\begin{document}
+
\linenumbers
\section{Introduction}
\label{sec:intro}