は、Iが正の大きな行列を有するR

マトリックス行と列の動作をスピードアップ：は、Iが正の大きな行列を有するR

set.seed(1) 
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000) 
rownames(mat) <- paste("r",1:nrow(mat),sep="")

matのrownamesをparent.idに関連付けられている：すべてのいくつかの行が関連付けられていること

row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))

は、このようなparent.idと同じです。

Iはmatにすべてrowため、これらの動作を計算する必要がある：

行要素の log同じ parent.id

持つ全ての行のうち、その行のの

mean割合の

mean
meanすべての行のうちのその行の割合のプロビット同じparent.id

を持つすべての行のうち、その行の割合の同じparent.id

sdプロビットは、当然のことながら、これは心に来る最初のソリューションです：

require(VGAM) 
res.df <- do.call(rbind,mclapply(1:nrow(mat), function(x) { 
    idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) 
    data.frame(mean.log=mean(log(mat[x,])), 
      mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)), 
      mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))), 
      sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum)))) 
}))

しかし、私はこれをより速く達成する方法があるのだろうかと思っています。

P.S. as.numeric私はdata.table上で操作を実行するたびに適用する場合を除き

set.seed(1) 
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5) 
rownames(mat) <- paste("r",1:nrow(mat),sep="") 
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep="")) 

require(VGAM) 
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) { 
    idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) 
    data.frame(mean.log=mean(log(mat[x,])), 
      mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)), 
      mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))), 
      sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum)))) 
}))) 


Unit: milliseconds 
                                                                                                                          expr 
df <- do.call(rbind, lapply(1:nrow(mat), function(x) {  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==   row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])  data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x,   ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x,   ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x,   ]/apply(mat[idx, ], 2, sum)))) })) 
     min  lq  mean median  uq  max neval 
10.15047 10.2894 10.69573 10.428 10.69741 14.56724 100

：

に比べ

require(data.table) 
require(microbenchmark) 
require(VGAM) 

set.seed(1) 
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5)) 
rownames(mat) <- paste("r",1:nrow(mat),sep="") 
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep="")) 

microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) { 
    idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) 
    data.frame(mean.log=mean(as.numeric(log(mat[x,]))), 
      mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)), 
      mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))), 
      sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum)))) 
}))) 


expr 
df <- do.call(rbind, lapply(1:nrow(mat), function(x) {  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==   row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])  data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x,   ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x,   ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x,   ])/apply(mat[idx, ], 2, sum)))) })) 
     min  lq  mean median  uq  max neval 
65.08929 66.49415 69.78937 67.70534 70.38044 206.017 100 
>

：私が代わりに行列のdata.tableを使用しないと思います

は、移動するための方法であります行は悪い考えです。

出典

2016-12-28 dan

あなたはdata.tableを試しましたか？ –

ありません。良いアイデア。 – dan

これは実際にはあまりいい考えではありません。更新された投稿を参照してください。 – dan

私は行列の代わりにdata.table使用しないと思いますが、明らかに

を移動するための方法である、あなたが実際にdata.table使用する必要があります。これは、あなたが何らかの努力を費やさずにコードを最適化する魔法の杖ではありません。 data.table構文を使用する必要があります。

mean of log of the row elements 

mean proportion of of that row out of all rows with the same parent.id 

mean probit of the proportion of of that row out of all rows with the same parent.id 

sd probit of the proportion of of that row out of all rows with the same parent.id

私は、これはあなたが必要なものをやるかもしれないと思う：

library(data.table) 
DT <- data.table(row.ids.df, mat) 
DT <- melt(DT, id.vars = c("row.id", "parent.id")) 

DT[, proportion := value/sum(value), by = .(variable, parent.id)] 

res <- DT[, .(
    mean.log = mean(log(value)), 
    mean.proportion = mean(proportion), 
    mean.probit = mean(probit(proportion)), 
    sd.probit = sd(probit(proportion))), by = row.id] 

all.equal(res[["sd.probit"]], 
      res.df[["sd.probit"]]) 
#[1] TRUE 
#(Tested with 100 rows and 30 columns.)

私はそれがより効率的であることを期待し、私はマット内のすべての行に対してこれらの操作を計算する必要が

しかし、それは間違いなくより読みやすいです。

出典

2016-12-28 07:48:00 Roland

は、Iが正の大きな行列を有するR

答えて

関連する問題