У меня есть функция, которая направлена на:Как эффективно использовать do.call функции в пользовательской функции с параллельными вычислениями
- имитировать два набора данных в соответствии с известным набором параметров двух моделей (нулевой и альтернативный)
- повторно подходят для обоих моделей моделируемых данных
Я хочу ускорить время вычисления с помощью параллельного пакета в сочетании с pblapply пакетом.
Вот функция:
simulate.data <- function (tree, null_m, alt_m, nsim = 5, do.parallel = T, optTEXT = NULL){
## null_m and alt_m are fitted using mvMORPH function
library(mvMORPH)
if (!all (class (null_m)[1] == "mvmorph" & class (alt_m)[1] == "mvmorph"))
stop ("Fitted models must be of class 'mvmorph'")
## define functions
transform <- function (x){
if (is.matrix (x)) {
res <- vector ("list", ncol (x))
for (i in 1:ncol (x)){
res[[i]] <- x[,i]
}
}
else {
res <- x
}
res
}
find_fun <- function (x){
class.temp <- class (x)[2]
if (class.temp == "mvmorph.bm") return ("mvBM")
if (class.temp == "mvmorph.ou") return ("mvOU")
if (class.temp == "mvmorph.shift") return ("mvSHIFT")
}
## take arguments of null and alternative fit
call.fun.A <- find_fun (null_m)
argsA <- null_m$param [names (null_m$param) %in% names (as.list (args (call.fun.A)))]
argsA <- lapply (argsA, function (x) if (length(x)>1) x[1]
else x)
call.fun.B <- find_fun(alt_m)
argsB <- alt_m$param [names (alt_m$param) %in% names (as.list (args (call.fun.B)))]
argsB <- lapply (argsB, function (x) if (length(x)>1) x[1]
else x)
## simulate datasets under null and alternative model
A.df <- transform (simulate(object = null_m, tree = tree, nsim = nsim))
B.df <- transform (simulate(object = alt_m, tree = tree, nsim = nsim))
## refit null (A) and alternative (B) model to simulated data
# AA: fit null model to data simulated under null model
library(pbapply)
op <- pboptions(type = "timer") # default
if (do.parallel){
library(parallel)
cl <- makeCluster(detectCores()-1)
clusterEvalQ (cl, library(mvMORPH))
clusterExport (cl, varlist=c("tree", ## tree
"A.df", "B.df", ## simulated data
"call.fun.A", "call.fun.B", ## values of these objects are names of mvMORPH functions to be called with do.call function
"argsA", "argsB"), envir=environment()) ## 'args' objects specify arguments to be passed to do.call function
clusterExport (cl, varlist = "do.call")
cat (paste0 ("\nfitting models to simulated data under the null model (", argsA$model, ")\n"))
AA <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
AB <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
cat (paste0 ("\nfitting models to simulated data under the alternative model (", argsB$model, ")\n"))
BA <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
BB <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
stopCluster(cl)
}
else {
cat (paste0 ("\nfitting models to simulated data under the null model (", argsA$model, ")\n"))
AA <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))))
AB <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))))
cat (paste0 ("\nfitting models to simulated data under the alternative model (", argsB$model, ")\n"))
BA <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))))
BB <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))))
}
res <- list (A = null_m, B = alt_m, AA = AA, AB = AB, BA = BA, BB = BB)
class (res) <- append (class(res),"sim.data")
if (!is.null(optTEXT)){
attributes (res) <- c (attributes(res), comment = optTEXT)
res
}
else res
}
Эта функция работает, но мне кажется, что есть узкое место во время параллельных вычислительных процедур. Я подозреваю, что do.call функция представила избыточность, но я не уверен ... Мне все еще нужно реализовать do.call или какую-то другую подобную функцию, так как мне нужно подавать список аргументов внутри pblapply, а аргументы специфичны для каждый подходит.
Чтобы продемонстрировать отсутствие производительности во время параллельных вычислений, я моделируются и используются следующие данные:
library (phytools)
Generating a tree with 80 tips
set.seed(789)
tree <- pbtree (n = 80)
# Setting the regime states of tip species
regimes <- as.vector(c(rep("R1",40), rep ("R2", 40)))
names(regimes) <- tree$tip.label
tree <- make.simmap (tree, regimes , model="ER", nsim=1)
# Simulate data
library (mvMORPH)
sigma <- c (R1 = 3, R2 = 0.5)
theta <- 0
# Simulate data under the "BMM" model
data <- mvSIM (tree, nsim = 1, model="BMM", param = list (sigma = sigma, theta = theta))
# Fit models
fit1 <- mvBM (tree = tree, data = data, model = "BMM", method = "sparse")
fit2 <- mvOU (tree = tree, data = data, model = "OUM", method = "pseudoinverse", param = list (maxit = 50000))
## run the function
ss.data <- simulate.data(tree = tree, null_m = fit1, alt_m = fit2, nsim = 100, do.parallel = T)
На моем компьютере с i3 CPU, я использовал 3 рабочих и получили следующие результаты:
>fitting models to simulated data under the null model (BMM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 14s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 56s
>fitting models to simulated data under the alternative model (OUM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 51s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 03m 12s
Когда я запускаю то же, что и выше, но без вычисления параллельных вычислений (do.parallel = F) в целом занимает меньше времени:
>fitting models to simulated data under the null model (BMM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 32s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 23s
>fitting models to simulated data under the alternative model (OUM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 09s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 02m 02s
После этого я просто запускаю часть своей функции в глобальной среде (не внутри функции), но используя параллельные вычисления. Код и результаты заключаются в следующем:
cl <- makeCluster(detectCores()-1)
clusterEvalQ (cl, library(mvMORPH))
clusterExport (cl, varlist=c("tree",
"A.df", "B.df",
"call.fun.A", "call.fun.B",
"argsA", "argsB"), envir=environment())
clusterExport (cl, varlist = "do.call")
>fitting models to simulated data under the null model (BMM)
AA <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
AB <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 26s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 57s
>fitting models to simulated data under the alternative model (OUM)
BB <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
BA <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 17s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 49s
stopCluster(cl)
Обратите внимание, что время параллельных вычислений в глобальной среде значительно ниже, чем в моей пользовательской функции ...
Наконец, я просто параллельные вычисления в глобальной окружающей среды, но без функции do.call который оказался наиболее эффективным:
cl <- makeCluster(detectCores()-1)
clusterEvalQ (cl, library(mvMORPH))
clusterExport (cl, varlist=c("tree",
"A.df", "B.df"), envir=environment())
>fitting models to simulated data under the null model (BMM)
AA <- pblapply (X = A.df, FUN = function(x)
mvBM (tree = tree, data = x, model = "BMM", method = "sparse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
AB <- pblapply (X = A.df, FUN = function(x)
mvOU (tree = tree, data = x, model = "OUM", method = "pseudoinverse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 19s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 49s
>fitting models to simulated data under the alternative model (OUM)
BA <- pblapply (X = B.df, FUN = function(x)
mvBM (tree = tree, data = x, model = "BMM", method = "sparse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
BB <- pblapply (X = B.df, FUN = function(x)
mvOU (tree = tree, data = x, model = "OUM", method = "pseudoinverse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 09s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 41s
stopCluster(cl)
Я ценю любое предложение и/или решения, которые могли бы помочь мне реализовать do.call в моем с более эффективной производительностью в сочетании с параллельной обработкой.