# GFM: alternate maximization and information criterion

#### 2023-08-10

In this tutorial, we show how alternate maximization (AM) is used in the first step of the two-step estimation method, and how the information criterion (IC) method is used to choose the number of factors.

## Fit GFM model using simulated data

The package can be loaded with the command:

# Attach the GFM package.
library(GFM)
# Fix the random seed so that the results are reproducible.
set.seed(1)

### GFM can handle data with homogeneous normal variables

First, we generate the data with homogeneous normal variables.

# Simulate a data set of homogeneous normal variables.
dat <- gendata(
  n = 100,
  p = 100,
  q = 2,
  rho = 3
)

Then, we set the algorithm parameters before fitting the model.

# Obtain the observed data
XList <- dat$XList # the data in the form of a list of matrices
str(XList)
X <- dat$X # the same data in the form of a single matrix
# Set the variables' type: 'gaussian' means the variables are continuous.
types <- "gaussian"

Third, we fit the GFM model with user-specified number of factors.

# Fit GFM by alternate maximization with the number of factors fixed at 2.
gfm1 <- gfm(
  XList, types,
  algorithm = "AM",
  q = 2,
  verbose = FALSE
)

# Canonical correlations between the GFM estimates (hH, hB) and the
# generated quantities stored in dat (H0, B0).
measurefun(gfm1$hH, dat$H0, type = "ccor")
measurefun(gfm1$hB, dat$B0, type = "ccor")

The number of factors can also be determined in a data-driven manner.

# Choose q from the candidates 1..6 with the information criterion (IC).
hq <- chooseFacNumber(
  XList, types,
  q_set = 1:6,
  select_method = "IC",
  parallelList = list(parallel = TRUE),
  verbose = FALSE
)
# Show the selected number of factors.
hq

### GFM outperforms LFM in analyzing data with heterogeneous normal variables

First, we generate the data with heterogeneous normal variables and set the parameters of the algorithm.

  dat <- gendata(seed = 1, n = 100, p = 100, type = "heternorm", q = 2, rho = 1)
# Obtain the observed data
XList <- dat$XList # the data in the form of a list of matrices
str(XList)
X <- dat$X # the same data in the form of a single matrix
# Set the variables' type: 'gaussian' means the variables are continuous.
types <- "gaussian"

Second, we fit the GFM model with a user-specified number of factors and compare the results with those of linear factor models.

# Fit GFM by alternate maximization with q = 2.
gfm1 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)

# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = "ccor")

# Fit the linear factor model on the matrix form of the same data.
# dat$X is used directly so this chunk does not depend on a separate X copy.
lfm1 <- Factorm(dat$X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (quantity, method) pair. The Quantity labels follow the order
# of CCor: the first two values are for the factors, the last two for the
# loadings.
# NOTE(review): assumes each measurefun() call returns a single value, so
# that CCor has exactly four entries -- confirm.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(rep(c("factors", "loadings"), each = 2))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)

The number of factors can also be determined in a data-driven manner.

# Choose q from the candidates 1..6 with the information criterion (IC).
hq <- chooseFacNumber(
  XList, types,
  q_set = 1:6,
  select_method = "IC",
  parallelList = list(parallel = TRUE),
  verbose = FALSE
)

### GFM outperforms LFM in analyzing data with Count(Poisson) variables

First, we generate the data with Count(Poisson) variables and set the parameters of algorithm.

  q <- 3
p <- 200
dat <- gendata(seed = 1, n = 200, p = p, type = "pois", q = q, rho = 4)
# Obtain the observed data
XList <- dat$XList # the data in the form of a list of matrices
str(XList)
X <- dat$X # the same data in the form of a single matrix
# Set the variables' type: 'poisson' is used here for the count variables.
types <- "poisson"

Second, we fit the GFM model given the true number of factors.

  # Time the AM fit with the number of factors fixed at 3.
system.time(
  gfm1 <- gfm(
    XList, types,
    q = 3,
    algorithm = "AM",
    verbose = FALSE
  )
)
# Time the IC-based selection of q over the candidates 1..6.
system.time(
  hq <- chooseFacNumber(
    XList, types,
    select_method = "IC",
    q_set = 1:6,
    parallelList = list(parallel = TRUE)
  )
)

Third, we compare the results with those of linear factor models.


# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = "ccor")

# Fit the linear factor model on the matrix form of the same data.
# dat$X is used directly so this chunk does not depend on a separate X copy.
lfm1 <- Factorm(dat$X, q = 3)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (quantity, method) pair. The Quantity labels follow the order
# of CCor: the first two values are for the factors, the last two for the
# loadings.
# NOTE(review): assumes each measurefun() call returns a single value, so
# that CCor has exactly four entries -- confirm.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(rep(c("factors", "loadings"), each = 2))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)

### GFM outperforms LFM in analyzing data with the mixed-types of count and categorical variables

First, we generate the data with mixed-type variables, i.e. count (Poisson) and categorical variables, and set the parameters of the algorithm. Then we fit the GFM model with a user-specified number of factors.

  dat <- gendata(seed = 1, n = 200, p = 200, type = "pois_bino", q = 2, rho = 2)
# Obtain the observed data
XList <- dat$XList # the data in the form of a list of matrices
str(XList)
X <- dat$X # the same data in the form of a single matrix
# Set the variables' types: here they are taken from the simulated data object.
types <- dat$types
# Tabulate the values of the first and the last column of the data matrix.
table(dat$X[, 1])
table(dat$X[, 200])

# user-specified q=2
gfm2 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
measurefun(gfm2$hH, dat$H0, type = "ccor")
measurefun(gfm2$hB, dat$B0, type = "ccor")

Third, we compare the results with that of linear factor models.

# select q automatically
hq <- chooseFacNumber(
  XList, types,
  select_method = "IC",
  q_set = 1:4,
  verbose = FALSE,
  parallelList = list(parallel = TRUE)
)

# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm2$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm2$hB, dat$B0, type = "ccor")

Compare with linear factor models

# The data were generated, and GFM was fitted, with q = 2, so the linear
# factor model uses the same number of factors (it was 3 before).
lfm1 <- Factorm(dat$X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (quantity, method) pair. The Quantity labels follow the order
# of CCor: the first two values are for the factors, the last two for the
# loadings.
# NOTE(review): assumes each measurefun() call returns a single value, so
# that CCor has exactly four entries -- confirm.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(rep(c("factors", "loadings"), each = 2))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)

## Session information

# Print the R version, platform, locale and packages of the current session.
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 22621)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=Chinese (Simplified)_China.936
#> [3] LC_MONETARY=Chinese (Simplified)_China.936
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=Chinese (Simplified)_China.936
#>
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base
#>
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.29   R6_2.5.1        jsonlite_1.8.0  magrittr_2.0.3
#>  [5] evaluate_0.15   stringi_1.7.6   rlang_1.1.0     cli_3.2.0
#>  [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1     rmarkdown_2.11
#> [13] tools_4.1.2     stringr_1.4.0   xfun_0.29       yaml_2.3.6
#> [17] fastmap_1.1.0   compiler_4.1.2  htmltools_0.5.2 knitr_1.37
#> [21] sass_0.4.1