[Rsiena-commits] r152 - in pkg: RSiena RSiena/man RSienaTest RSienaTest/R RSienaTest/doc RSienaTest/man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun May 29 20:42:29 CEST 2011


Author: jalospinoso
Date: 2011-05-29 20:42:29 +0200 (Sun, 29 May 2011)
New Revision: 152

Modified:
   pkg/RSiena/DESCRIPTION
   pkg/RSiena/changeLog
   pkg/RSiena/man/RSiena-package.Rd
   pkg/RSienaTest/DESCRIPTION
   pkg/RSienaTest/R/sienaGOF.r
   pkg/RSienaTest/changeLog
   pkg/RSienaTest/doc/s_man400.tex
   pkg/RSienaTest/man/RSiena-package.Rd
   pkg/RSienaTest/man/sienaGOF.Rd
Log:
152

Modified: pkg/RSiena/DESCRIPTION
===================================================================
--- pkg/RSiena/DESCRIPTION	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSiena/DESCRIPTION	2011-05-29 18:42:29 UTC (rev 152)
@@ -1,8 +1,8 @@
 Package: RSiena
 Type: Package
 Title: Siena - Simulation Investigation for Empirical Network Analysis
-Version: 1.0.12.151
-Date: 2011-05-27
+Version: 1.0.12.152
+Date: 2011-05-29
 Author: Various
 Depends: R (>= 2.10.0), xtable
 Imports: Matrix

Modified: pkg/RSiena/changeLog
===================================================================
--- pkg/RSiena/changeLog	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSiena/changeLog	2011-05-29 18:42:29 UTC (rev 152)
@@ -1,3 +1,7 @@
+2011-05-29 R-forge revision 152 RSienaTest only
+
+	* R/sienaGOF.r: Fixed a few bugs,  doc/s_400.tex: updated the script
+
 2011-05-27 R-forge revision 151
 
 	* R/observationErrors.r: added various traps for vectors of length

Modified: pkg/RSiena/man/RSiena-package.Rd
===================================================================
--- pkg/RSiena/man/RSiena-package.Rd	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSiena/man/RSiena-package.Rd	2011-05-29 18:42:29 UTC (rev 152)
@@ -30,8 +30,8 @@
 \tabular{ll}{
 Package: \tab RSiena\cr
 Type: \tab Package\cr
-Version: \tab 1.0.12.151\cr
-Date: \tab 2011-05-27\cr
+Version: \tab 1.0.12.152\cr
+Date: \tab 2011-05-29\cr
 License: \tab GPL-2 \cr
 LazyLoad: \tab yes\cr
 }

Modified: pkg/RSienaTest/DESCRIPTION
===================================================================
--- pkg/RSienaTest/DESCRIPTION	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/DESCRIPTION	2011-05-29 18:42:29 UTC (rev 152)
@@ -1,8 +1,8 @@
 Package: RSienaTest
 Type: Package
 Title: Siena - Simulation Investigation for Empirical Network Analysis
-Version: 1.0.12.151
-Date: 2011-05-27
+Version: 1.0.12.152
+Date: 2011-05-29
 Author: Various
 Depends: R (>= 2.10.0), xtable
 Imports: Matrix

Modified: pkg/RSienaTest/R/sienaGOF.r
===================================================================
--- pkg/RSienaTest/R/sienaGOF.r	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/R/sienaGOF.r	2011-05-29 18:42:29 UTC (rev 152)
@@ -68,7 +68,8 @@
 	}
 	if (is.null(wave) )
 	{
-		wave <- 1:(dim(sienaFitObject$f[[groupName]]$depvars[[varName]])[3] - 1)
+		wave <- 1:(attr(sienaFitObject$f[[groupName]]$depvars[[varName]], 
+						"netdims")[3] - 1)
 	}
 	if (varNumber < 1 || varNumber >
 			length(sienaFitObject$sims[[1]][[groupNumber]]))
@@ -76,8 +77,8 @@
 		stop("Invalid variable number -- out of bounds.")
 	}
 	if (min(wave) < 1 || max(wave) >
-			dim(sienaFitObject$f[[groupName]]$depvars[[varName]])[3] - 1
-		)
+			attr(sienaFitObject$f[[groupName]]$depvars[[varName]], 
+							"netdims")[3] - 1)
 	{
 		stop("Invalid wave index -- out of bounds")
 	}
@@ -86,7 +87,9 @@
 						matrix(
 						auxiliaryFunction(NULL,
 								sienaFitObject$f, sienaFitObject$sims,
-								groupName, varName, j), nrow=1) })
+								groupName, varName, j)
+						, nrow=1) 
+				})
 	if (join)
 	{
 		obsStats <- Reduce("+", obsStatsByWave)
@@ -194,7 +197,7 @@
 		}
 		ainv <- ginv(a)
 		arank <- rankMatrix(a)
-		expectation <- apply(simulated, 2, mean);
+		expectation <- colMeans(simulated);
 		centeredSimulations <- scale(simulated, scale=FALSE)
 		if (variates==1)
 		{
@@ -250,7 +253,7 @@
 	obsMhd <- NULL
 
 	ExpStat <- lapply(wave, function(i) {
-				apply(simStatsByWave[[i]], 2, mean)
+				colMeans(simStatsByWave[[i]])
 			})
 	OneStepSpecs <- matrix(0, ncol=sum(sienaFitObject$test),
 			nrow=length(sienaFitObject$theta))
@@ -341,11 +344,9 @@
 							sigma, fra, doTests,
 							maxlike=sienaFitObject$maxlike)$oneStep )
 			mmPartialThetaDelta <- rep(0,length(theta0))
-			mmPartialThetaDelta[length(theta0)] <-
-					mmThetaDelta[length(theta0)]
+			mmPartialThetaDelta[length(theta0)] <- mmThetaDelta[length(theta0)]
 			JacobianExpStat <- lapply(wave, function (i) {
-				t(SF[,i,]) %*% simStatsByWave[[i]] / nSims
-					})
+				t(SF[,i,]) %*% simStatsByWave[[i]]/ nSims  }) 
 			Gradient <- lapply(wave, function(i) {
 						-2  * JacobianExpStat[[i]] %*%
 								covInvByWave[[i]] %*%
@@ -669,11 +670,13 @@
 
 sparseMatrixExtraction <- function (i, data, sims, groupName, varName, wave) {
 	#require(Matrix)
-	dimsOfDepVar=dim(data[[groupName]]$depvars[[varName]][,,wave])
-	missing <- Matrix(is.na(data[[groupName]]$depvars[[varName]][,,wave])*1)
+	dimsOfDepVar<- 
+			attr(data[[groupName]]$depvars[[varName]],
+					"netdims")
+	missing <- Matrix(is.na(data[[groupName]]$depvars[[varName]][,,wave+1])*1)
 	if (is.null(i)) {
 		# sienaGOF wants the observation:
-		returnValue <- Matrix(data[[groupName]]$depvars[[varName]][,,wave])
+		returnValue <- Matrix(data[[groupName]]$depvars[[varName]][,,wave+1])
 		returnValue[is.na(returnValue)] <- 0
 	}
 	else
@@ -698,23 +701,23 @@
 
 snaSociomatrixExtraction <- function (i, data, sims, groupName, varName, wave) {
 	require(sna)
-	dimsOfDepVar=dim(data[[groupName]]$depvars[[varName]][,,wave])
-	missing <- as.sociomatrix.sna(
-		1*is.na(data[[groupName]]$depvars[[varName]][,,wave]))
-
+	actors <- attr(data[[groupName]]$nets[[varName]][[wave+1]]$mat1,
+			"nActors")
+	missing <- t(data[[groupName]]$nets[[varName]][[wave+1]]$mat1)
+	attr(missing, "n") <- actors
+	missing <- 1*is.na( as.sociomatrix.sna( missing ) )
+	
 	if (is.null(i)) {
 		# sienaGOF wants the observation:
-		returnValue <- data[[groupName]]$depvars[[varName]][,,wave]
-		returnValue <- as.sociomatrix.sna(returnValue)
-		returnValue[is.na(returnValue)] <- 0
+		returnValue <- t(data[[groupName]]$nets[[varName]][[wave+1]]$mat1)
 	}
 	else
 	{
 		#sienaGOF wants the i-th simulation:
 		returnValue <- sims[[i]][[groupName]][[varName]][[wave]]
-		attr(returnValue, "n") <- dimsOfDepVar[1]
-		returnValue <- as.sociomatrix.sna( returnValue )
 	}
+	attr(returnValue, "n") <- actors
+	returnValue <- as.sociomatrix.sna( returnValue )
 	returnValue = 1*((returnValue - missing) > 0)
 	returnValue
 }

Modified: pkg/RSienaTest/changeLog
===================================================================
--- pkg/RSienaTest/changeLog	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/changeLog	2011-05-29 18:42:29 UTC (rev 152)
@@ -1,3 +1,7 @@
+2011-05-29 R-forge revision 152 RSienaTest only
+
+	* R/sienaGOF.r: Fixed a few bugs,  doc/s_400.tex: updated the script
+
 2011-05-27 R-forge revision 151
 
 	* R/observationErrors.r: added various traps for vectors of length

Modified: pkg/RSienaTest/doc/s_man400.tex
===================================================================
--- pkg/RSienaTest/doc/s_man400.tex	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/doc/s_man400.tex	2011-05-29 18:42:29 UTC (rev 152)
@@ -3053,18 +3053,18 @@
 ################################################################################
 ## Model selection and RSiena								                   #
 ## Script prepared by Josh Lospinoso (stats.ox.ac.uk/~lospinos)                #
-## Date: 5/26/2011											                   #
-## Version: 3												                   #
+## Date: 5/29/2011											                   #
+## Version: 4												                   #
 ################################################################################
 
-# In this script, we will load the s50 data (type ?s50 for more information)
+# In this script, we will load the s50 data (type ?s50 for more information) 
 # and go through an iteration of forward model selection. The idea is to specify
-# a restricted model containing a conservative number of effects and then
-# evaluate evidence for inclusion of more elaborate terms. This approach
-# contrasts with backward model selection, which would entail inclusion of a
+# a restricted model containing a conservative number of effects and then 
+# evaluate evidence for inclusion of more elaborate terms. This approach 
+# contrasts with backward model selection, which would entail inclusion of a 
 # large number of terms and subsequent evaluation for omission of terms.
 
-# It is strongly emphasized here that theory should always guide model
+# It is strongly emphasized here that theory should always guide model 
 # selection! In this script, our intent is to illustrate some of the tools that
 # are available to you and not necessarily to provide a "cookie cutter" solution
 # to fitting SAOMs to all datasets:
@@ -3075,7 +3075,7 @@
 
 # First, we will install the experimental "RSienaTest" from R-Forge.
 # This version is updated more often than the RSiena package on CRAN.
-# Currently, one of the functions (sienaGOF) is not yet available in the
+# Currently, one of the functions (sienaGOF) is not yet available in the 
 # CRAN version.
 install.packages("RSienaTest", repos="http://R-Forge.R-project.org")
 
@@ -3087,83 +3087,73 @@
 # RSienaTest package:
 library(RSienaTest)
 
-# First, give some estimation settings. Here we will use 4 phase 2 subphases
+# First, give some estimation settings. Here we will use 4 phase 2 subphases 
 # and 1000 phase 3 iterations. These settings come generally recommended for
 # most datasets.
 estimationSettings <- sienaModelCreate(fn=simstats0c, nsub=4, n3=1000)
 
-# The s50 dataset is already loaded into the R environment once the
-# library(.) function is called. See e.g. ?s501 for information on
+# The s50 dataset is already loaded into the R environment once the 
+# library(.) function is called. See e.g. ?s501 for information on 
 # each data object.
 
 # This will load three waves of 50 actors into a sienaNet
 friendship <- sienaNet(array(c(s501, s502, s503), dim=c(50, 50, 3)))
-alcohol <- varCovar(s50a)
-s50data <- sienaDataCreate(friendship, alcohol)
+s50data <- sienaDataCreate(friendship)
 
 # Here we begin to build up a preliminary model selection. By default, this
 # model will have density and reciprocity effects included from getEffects(.)
 model1 <- getEffects(s50data)
+model1 <- includeEffects(model1, 
+		density,
+		recip,
+		transTrip, name="friendship")
 
-# Or with the include effects (preferred option):
+# Here, we can add a few score type tests for more elaborate transitive closure
+# terms. The choice of which cocktail of closure terms can have profound effects
+# on the global structures of the network.
+model1 <- setEffect(model1, cycle3, 
+		fix=TRUE, test=TRUE, include=TRUE)
+model1 <- setEffect(model1, nbrDist2, 
+		fix=TRUE, test=TRUE, include=TRUE)
 
-model1 <- includeEffects(model1, transTrip, cycle3, name="friendship")
-model1 <- includeEffects(model1, simX, interaction1="alcohol",
-	name="friendship")
-# Here, we can add a few score type tests for covariate terms (which can be
-# highly correlated). These tests will help us to determine whether we should
-# include more complicated selection effects in future iterations.
-model1 <- setEffect(model1, egoX, interaction1="alcohol", name="friendship",
-	fix=TRUE, test=TRUE, include=TRUE)
-model1 <- setEffect(model1, altX, interaction1="alcohol", name="friendship",
-	fix=TRUE, test=TRUE, include=TRUE)
-model1 <- setEffect(model1, higher, interaction1="alcohol", name="friendship",
-	fix=TRUE, test=TRUE, include=TRUE)
-
 # Check what is specified in model1:
 model1
 
-# We have chosen to start with two transitivity terms (transitive triplets
-# and three cycles), and one selection, one influence term for alcohol
-# consumption.
-
 # We use the siena07() function to estimate the parameters of model1
-# NOTE: you MUST specify returnDeps=TRUE in order to use the goodness of
+# NOTE: you MUST specify returnDeps=TRUE in order to use the goodness of 
 # fit testing functionality.
-( results1 <- siena07(estimationSettings, data=s50data, effects=model1,
+( results1 <- siena07(estimationSettings, data=s50data, effects=model1, 
 	batch=TRUE, returnDeps=TRUE) )
 
-# Next, we perform a test for time heterogeneity. This will help us see
+# Next, we perform a test for time heterogeneity. This will help us see 
 # if the parameters have shifted over time. It comes highly
 # recommended to perform this step before checking other goodness of fit!
 ( timetest1 <- sienaTimeTest(results1) )
-# With a p value of about .30, there is very little joint evidence against
+# With a p value of about .55, there is very little joint evidence against
 # the hypothesis of time homogeneity.
 
 # To get a more informal feel for how time heterogeneity may be present in
 # the data, we can plot the test for each effect:
-plot(timetest1, effects=1:4)
-plot(timetest1, effects=5:8)
+plot(timetest1, effects=1:5)
 # Gray bands indicate the base periods (or, alternately, time dummies that
 # have been estimated using siena07), and red bands indicate potential time
 # dummies. These bands are 95% confidence bands. Where they do not overlap,
 # there is (individual) evidence against the time homogeneity hypothesis.
 
 # There is some information on sienaTimeTest available in the help file,
-# where you can find the citation for Lospinoso et. al (2010)
+# where you can find the citation for Lospinoso et. al (2010) 
 # where the test is developed.
 ?sienaTimeTest
 
-# We can now check the score type test results for more involved
+# We can now check the score type test results for more involved 
 # selection terms. Summary of a siena07 object gives us a wealth
 # of information about estimation, but it will also give us a score-type
 # test of Schweinberger (2007) calculated for those effects which we have
 # denoted fix=TRUE, test=TRUE above:
 summary(results1)
 # In the section "Generalised score test <c>", we are presented with the
-# results. Since the joint p value is about .66, there is very little
-# evidence that these effects (alch alter, alch ego, higher alch)
-# are fit poorly with respect to their statistics.
+# results. It shows that, with a p value of (<.0001), the actors at distance
+# two effect should be included in the next model.
 
 # We have checked time heterogeneity and the goodness of fit of a few
 # alcohol-related statistics, but we perform one last check using the
@@ -3172,8 +3162,8 @@
 # with the model but that are not strictly fit by the parameters of the
 # model. There is a simple battery of auxiliary statistics that is already
 # implemented in RSiena: indegree and outdegree distributions, triad census,
-# and average nearest neighboor degree (KNN) distributions. First, let us
-# calculate these using our results:
+# and average nearest neighboor degree (KNN) distributions. First, let us 
+# calculate the following using our results:
 
 # - Indegree distribution ( # of nodes with indegree < A for A in 1:MAX)
 ( id1 <- sienaGOF(results1, IndegreeDistribution, verbose=TRUE) )
@@ -3181,12 +3171,60 @@
 # - Outdegree distribution ( # of nodes with outdegree < A for A in 1:MAX)
 ( od1 <- sienaGOF(results1, OutdegreeDistribution, verbose=TRUE) )
 
-# - Triad census (see ?triad.census in the sna package)
-( tc1 <- sienaGOF(results1, TriadCensus, verbose=TRUE) )
-
 # - Geodesic distribution  ( # of geodesces < A for A in 1:MAX)
 ( ge1 <- sienaGOF(results1, GeodesicDistribution, verbose=TRUE) )
 
+# We can also write our own custom functions. Here we take all of the transitive
+# triplets and intransitive triplets (omitting the vacuously transitive enumerations)
+# (See Wasserman and Faust 1994, p. 243):
+#
+# Transitive:
+# 030T a  -> b  -> c, a  -> c
+# 120D a  -> b <-> c, a  -> c
+# 120U a <-  b <-> c, a <-  c
+# 300  a <-> b <-> c, a <-> c
+#
+# Intransitive:
+# 021C a  -> b  -> c, a     c
+# 030C a  -> b  -> c, c  -> a
+# 111D a  -> b <-> c, a     c
+# 111U a <-  b <-> c, a     c
+# 120C a  -> b  -> c, a <-> c
+# 201  a <-> b <-> c, a     c
+# 210  a <-> b <-> c, a  -> c
+#
+# The triad.census function is available from the package "sna" (see ?triad.census)
+# It will return a vector of length 16 with the following configurations:
+#
+#1  "003"
+#2  "012"
+#3  "102"
+#4  "021D"
+#5  "021U"
+#6  "021C"
+#7  "111D"
+#8  "111U"
+#9  "030T"
+#10 "030C"
+#11 "201"
+#12 "120D"
+#13 "120U"
+#14 "120C"
+#15 "210"
+#16 "300"
+#
+# So, we wish to screen out the transitive triplets c(9,12,13,16)
+# and the intransitive triplets c(6,10,7,8,14,11,15)
+#
+CustomTriadCensus <- function(i, data, sims, groupName, varName, wave,
+		extractor=snaEdgelistExtraction) {
+	require(sna)
+	x <- extractor(i, data, sims, groupName, varName, wave)
+	triad.census(x)[6:16]
+}
+# Execute the test:
+( tc1 <- sienaGOF(results1, CustomTriadCensus, verbose=TRUE) )
+
 # For now, we will not worry about the contents of the text output for these
 # objects, but we will seek to get an intuitive feel for how well these features
 # are fitting by using the plot.sienaGOF functionality:
@@ -3200,62 +3238,42 @@
 # of Lospinoso and Snijders (2011) (see ?sienaGOF for more information).
 # The null hypothesis for this p value is that the auxiliary statistics
 # for the observed data are distributed according to the statistics shown in
-# the plot. For id1, we see that the p value is about .35, so we are left
+# the plot. For id1, we see that the p value is about .40, so we are left
 # with the impression that fit is quite good. (This can be seen visually also!)
 
 # Try outdegree also:
 plot(od1)
-# We see some marginal fit on the outdegree (p is .062), but the observations
-# are within the 95% bands. We will see what we can do to improve this fit.
+# We see good on the outdegree also (p is .48).
 
-# Triad census:
-plot(tc1)
-# How can we possibly read this? Use centering and scaling:
-plot(tc1, scale=TRUE, center=TRUE)
 # We can use the keys given in ?triad.classify of the sna package to help
-# us understand where fit is lacking:
-triad.keys <- c("003","012","102","021D","021U","021C","111D","111U",
-		"030T","030C","201","120D","120U","120C","210","300")
-# and add these to the plot:
-plot(tc1, scale=TRUE, center=TRUE, key = triad.keys)
+# us understand where fit is lacking on triad census:
+triad.keys <- c("021C","111D","111U","030T","030C","201","120D",
+		"120U","120C","210","300")
+plot(tc1, key = triad.keys)
 # The following triads are not being represented well in the model, based
 # on a 95% confidence interval cutoff:
-# 111U: a     b  -> c ; a <-> c  (Too many being simulated)
-# 030T: a  -> b <-  c ; a  -> c  (Too few being simulated)
-#  201: a  -> b <-> c ; a <-> c  (Too many being simulated)
+# 111U: a <-> b -> c ; a   c  (Too many being simulated)
 
 # And geodesic distance:
 plot(ge1)
-# The fit is marginal (p is roughly .09), but for a fairly simple model not
-# too bad. The observations are on the bottom of the confidence interval.
+# The fit is poor (p is less than 0.01)
+
+# The observations are on the bottom of the confidence interval.
 # Since there are a finite number of dyads in the graph, this means that the
 # simulations are too connected--that is, there are more geodesces
 # less than, say, 8 in the simulations than in the observations, so the
 # observed geodesces must be more frequent at higher numbers.
 
-# So far, we have reached two conclusions about goodness of fit:
-# 1) More involved alter selection effects do not seem necessary
-# 2) There is pretty good fit for such a simple model. We could try to improve
-# 	 fit on the geodesic distribution and a few of the triads.
-#
-# Our strategy is then to try fitting geodesic distances better. Let us add
-# actors at distance two:
-model2 <- includeEffects(model1, nbrDist2)
-# We also tidy up the effects by removing the tests for more involved selection
-# effects on alcohol:
-model2 <- setEffect(model2, egoX, interaction1="alcohol", name="friendship",
-		fix=FALSE, test=FALSE, include=FALSE)
-model2 <- setEffect(model2, altX, interaction1="alcohol", name="friendship",
-		fix=FALSE, test=FALSE, include=FALSE)
-model2 <- setEffect(model2, higher, interaction1="alcohol", name="friendship",
-		fix=FALSE, test=FALSE, include=FALSE)
-# Not wanting to make too many changes to the model at once, let us also try
-# some degree related effects to help our outdegree fit:
-model2 <- setEffect(model2, outPop, fix=TRUE, test=TRUE, include=TRUE)
-model2 <- setEffect(model2, outAct, fix=TRUE, test=TRUE, include=TRUE)
+# For now, let us follow the suggestsions from the score test of Schweinberger
+# and include the actors at distance two effect.
+model2 <- setEffect(model1, nbrDist2, fix=FALSE, test=FALSE, include=TRUE)
 
+# And add another candidate for improving triad census (three cycles is still
+# on the effects object):
+model2 <- setEffect(model2, balance, fix=TRUE, test=TRUE, include=TRUE)
+
 # Estimate our model (note the use of prevAns to speed up convergence):
-( results2 <- siena07(estimationSettings, data=s50data, effects=model2,
+( results2 <- siena07(estimationSettings, data=s50data, effects=model2, 
 					batch=TRUE, returnDeps=TRUE, prevAns=results1) )
 
 # Next, we perform a test for time heterogeneity:
@@ -3267,7 +3285,7 @@
 # distributions as well as triad census:
 ( id2 <- sienaGOF(results2, IndegreeDistribution, verbose=TRUE) )
 ( od2 <- sienaGOF(results2, OutdegreeDistribution, verbose=TRUE) )
-( tc2 <- sienaGOF(results2, TriadCensus, verbose=TRUE) )
+( tc2 <- sienaGOF(results2, CustomTriadCensus, verbose=TRUE) )
 ( ge2 <- sienaGOF(results2, GeodesicDistribution, verbose=TRUE) )
 # We will treat each of these in turn.
 
@@ -3275,16 +3293,10 @@
 summary(id2)
 # The summary output tells us information first about the Monte Carlo
 # Mahalanobis Distance goodness of fit test of Lospinoso and Snijders (2011).
-# The p value of .03 tells us that fit is marginal on indegree distribution.
-# This is a potentially frustrating aspect of testing GOF on auxiliary
-# statistics! There is no guarantee that adding more terms to a model will
-# not decrease fit! Recall that
-id1
-# showed good fit.
-# The next item to notice in
-summary(id2)
+# The p value of .40 tells us that fit is still good on indegree distribution.
+# The next item to notice in this summary
 # are the outdegree - popularity and outdegree - activity displays.
-# These two effects had test=TRUE selected on the effects object, so
+# These two effects had test=TRUE selected on the effects object, so 
 # sienaGOF treats these specially. It considers a polynomial approximation
 # to the Mahalanobis distance with respect to the parameters of the model
 # so that we can posit various specifications and compare their expected
@@ -3300,7 +3312,7 @@
 #
 # MM(Par.) -- For "method of moments  (partial)" corresponds to an estimate
 #	where we only take the one step estimate coordinate from MM(full) for
-#	the effect under consideration. E.g. for "outdegree - popularity",
+#	the effect under consideration. E.g. for "outdegree - popularity", 
 #	all parameters are equivalent to the current estimates, but we replace
 #	the 0 for outdegree - popularity with its one step estimate.
 #
@@ -3311,43 +3323,41 @@
 #	approximation for the Mahalanobis distance. It, by definition, will have
 #	a MHD less than both MM(Full) and MM(Par.).
 #
-#	Unfortunately, it appears that for both models under consideration, the
-#	Mahalanobis distance will increase. We must take this guidance with a heavy
-#	grain of salt for the reasons mentioned above; we recommend comparing the
-#	relative MHDs to guide selection. If we place some trust in the one step
-#	estimates (which seem fairly plausible for both models), we are led to
-#	select outdegree - popularity to help with indegree distribution. Indeed,
-# 	this selection tends to make theortical sense, since activity is synonymous
-# 	with indegree!
+# These estimates can give us some indication of which effects are best to includ
+# in our model, if we wish to reduce the distance between the observations and the 
+# simulations. Since outdegree and indegree distributions already fit quite well,
+# Because fit is sufficient for the indegree distribution, we do not need to do
+# much more analysis of this output.
 
 # For outdegree,
 summary(od2)
-# it seems also that outdegree - popularity is the best choice (although we note
-# that the fit is quite good on outdegree already!)
+# we likewise still have good fit.
 
 # For geodesic distances,
 summary(ge2)
-# we might also be led towards including outdegree popularity (although, again,
-# it seems that we have achieved quite good fit for geodesic distances with a
-# p value of .707)
+# we have achieved quite good fit by simply including actors at distance two.
 
 # For triad census,
 summary(tc2)
 # where fit is the worst across all of the statistics, outdegree popularity seems
-# to be the best candidate as well.
+# to be the best candidate, although the magnitude of these estimates cannot be
+# trusted (we take the ranking rather than the real value)
+plot(tc2, key = triad.keys)
+# Here we can use the suggestions of the MM one step estimates. It seems that
+# we can expect a decrease from about 58 to 52 if we include balance:
+model3 <- setEffect(model2, balance, fix=FALSE, test=FALSE, include=TRUE)
+# While three cycles has not yet had occasion to enter into our model, we continue
+# to use it as a candidate, since triad census is not fit well and the three cycles
+# is one way to directly fix one of the statistics (030C). Let us compare it
+# against dense triads (6), i.e. (300) as we consider model 4:
+model3 <- setEffect(model3, denseTriads, fix=TRUE, test=TRUE, include=TRUE)
+model3[model3$shortName=="denseTriads","parm"] <- 6
 
-# Model selection is an art, and thus there is no strictly superior criteria
-# for updating our model. In the current selection, it seems that the
-# outdegree popularity model has accumulated enough support across the four
-# tests that we decide to include it here:
-model3 <- setEffect(model2, outPop, fix=FALSE, test=FALSE, include=TRUE)
-model3 <- setEffect(model3, outAct, fix=FALSE, test=FALSE, include=FALSE)
-
-# Estimate our model
-( results3 <- siena07(estimationSettings, data=s50data, effects=model3,
+# Estimate our model 3
+( results3 <- siena07(estimationSettings, data=s50data, effects=model3, 
 					batch=TRUE, returnDeps=TRUE, prevAns=results2) )
-# Convergence could be improved:
-( results3 <- siena07(estimationSettings, data=s50data, effects=model3,
+# Convergence could be improved (convergence statistics should be <.1 in abs. val.):
+( results3 <- siena07(estimationSettings, data=s50data, effects=model3, 
 					batch=TRUE, returnDeps=TRUE, prevAns=results3) )
 
 # Time test
@@ -3357,43 +3367,52 @@
 # Check goodness of fit again:
 ( id3 <- sienaGOF(results3, IndegreeDistribution, verbose=TRUE) )
 ( od3 <- sienaGOF(results3, OutdegreeDistribution, verbose=TRUE) )
-( tc3 <- sienaGOF(results3, TriadCensus, verbose=TRUE) )
+( tc3 <- sienaGOF(results3, CustomTriadCensus, verbose=TRUE) )
 ( ge3 <- sienaGOF(results3, GeodesicDistribution, verbose=TRUE) )
 
-# It seems that the test for indegree distribution indicates evidence of
-# poor fit:
-id3
-# but when we plot it:
-plot(id3)
-# the fit seems quite good. This is an artifact of the default settings for our
-# IndegreeDistribution function. It seems that there are few actors with indegree
-# higher than, say, 6, so the non-normality of this data is causing problems.
-# We can define a custom function easily which restricts the support:
-CustomIndegreeDistribution <- function(...) {
-	IndegreeDistribution(..., levels=1:6)
-}
-# and run this test again:
-( id3 <- sienaGOF(results3, CustomIndegreeDistribution, verbose=TRUE) )
-plot(id3)
-# where now we have a test agreeing with the plot.
+# All of the fits are very good aside from a rather marginal fit for triad census:
+plot(tc3, key=triad.keys)
+# which have persisted throughout the two model updates we have proposed.
+summary(tc3)
+# The summary suggests that we consider denseTriads:
+model4 <- setEffect(model3, denseTriads, fix=FALSE, test=FALSE, include=TRUE)
+model4[model4$shortName=="denseTriads","parm"] <- 6
+# We are rapidly saturating the triadic configurations in our model. We take the 
+# opportunity to try degree related effects as an competing candidate to improve fit:
+model4 <- setEffect(model4, inPop, fix=TRUE, test=TRUE, include=TRUE)
 
-# The only other problems we seem to have are with triad census:
-plot(tc3, scale=TRUE, center=TRUE, key=triad.keys)
-# which have persisted throughout the two model updates we have proposed!
-# Graphically, the data is not too implausible, so we might take heed of
-# George Box's famous quote "All models are wrong, some are useful" and
-# proceed with our inference. If, however, triad census across the configurations
-# that seem to not fit well could have an impact on your inference, we strongly
-# recommend spending some more time trying new specifications.
+( results4 <- siena07(estimationSettings, data=s50data, effects=model4, 
+					batch=TRUE, returnDeps=TRUE, prevAns=results3) )
+# Check goodness of fit again:
+( id4 <- sienaGOF(results4, IndegreeDistribution, verbose=TRUE) )
+( od4 <- sienaGOF(results4, OutdegreeDistribution, verbose=TRUE) )
+( tc4 <- sienaGOF(results4, CustomTriadCensus, verbose=TRUE) )
+( ge4 <- sienaGOF(results4, GeodesicDistribution, verbose=TRUE) )
 
-# Geodesic distance distribution
-plot(ge3)
-# and outdegree
-plot(od3)
-# fit adequately.
+# Fit seems adequate across all of the statistics.
+#
+# Continuing along in this fashion, it is possible to improve fit even more, although
+# we again stress that theory should be the principle reason for effect additions.
+# This model has very nice fit, and was achieved after a few more iterations.
+model5 <- includeEffects(
+		getEffects(s50data), 
+		balance,
+		inPop,
+		outPop,
+		cycle3,
+		denseTriads,
+		transTrip,
+		nbrDist2, 
+		name="friendship")
 
-# Now that goodness of fit has been addressed, and we have settled in on a model,
-# we can continue on drawing inference on the sustantive hypotheses of interest.
+model5[model5$shortName=="denseTriads","parm"] <- 6
+
+( results5 <- siena07(estimationSettings, data=s50data, effects=model5, 
+					batch=TRUE, returnDeps=TRUE, prevAns=results4) )
+( id5 <- sienaGOF(results5, IndegreeDistribution, verbose=TRUE) )
+( od5 <- sienaGOF(results5, OutdegreeDistribution, verbose=TRUE) )
+( tc5 <- sienaGOF(results5, CustomTriadCensus, verbose=TRUE) )
+( ge5 <- sienaGOF(results5, GeodesicDistribution, verbose=TRUE) )
 \end{verbatim}
 
 \newpage

Modified: pkg/RSienaTest/man/RSiena-package.Rd
===================================================================
--- pkg/RSienaTest/man/RSiena-package.Rd	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/man/RSiena-package.Rd	2011-05-29 18:42:29 UTC (rev 152)
@@ -30,8 +30,8 @@
 \tabular{ll}{
 Package: \tab RSiena\cr
 Type: \tab Package\cr
-Version: \tab 1.0.12.151\cr
-Date: \tab 2011-05-27\cr
+Version: \tab 1.0.12.152\cr
+Date: \tab 2011-05-29\cr
 License: \tab GPL-2 \cr
 LazyLoad: \tab yes\cr
 }

Modified: pkg/RSienaTest/man/sienaGOF.Rd
===================================================================
--- pkg/RSienaTest/man/sienaGOF.Rd	2011-05-27 16:17:01 UTC (rev 151)
+++ pkg/RSienaTest/man/sienaGOF.Rd	2011-05-29 18:42:29 UTC (rev 152)
@@ -175,8 +175,18 @@
 	myeff[myeff$shortName=='cycle3', c('fix', 'test')] <-TRUE
 	myeff[myeff$shortName=='balance', c('fix', 'test')] <-TRUE
 	ans <- siena07(mymodel, data=mydata, effects=myeff, returnDeps=TRUE, batch=TRUE)
-	( res <- sienaGOF(ans, KnnDistribution, cumulative=TRUE) )
+	( res <- sienaGOF(ans, KnnDistribution) )
 	plot(res)
+	
+	## We can also make a custom function:
+	#
+	CustomTriadCensus <- function(...) {
+		require(sna)
+		x <- snaEdgelistExtraction(...)
+		triad.census(x)[6:16]
+	}
+	# Execute the test:
+	( res <- sienaGOF(ans, CustomTriadCensus) )	
 }
 }
 \keyword{models}



More information about the Rsiena-commits mailing list