data.frame内の条件に基づいて累積的に数値を加算するループを最適化することに問題があります。以下近い万行の大きなデータセットから数行を含む入力data.frameれる:data.frame(大型データセット)内の条件に基づく累積加算のループ
outputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1", "SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8", "SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"), sample_id = c(8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L), allele1 = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1), IBD = c(NA, NA, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 1, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 1, 0, 1)), .Names = c("SNP_pos", "sample_id", "allele1", "sample_id_x", "allele2", "snp_diff", "IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")
:
inputData <- structure(list(SNP_pos = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L), .Label = c("SNP_1", "SNP_2", "SNP_3", "SNP_4", "SNP_5", "SNP_6", "SNP_7", "SNP_8", "SNP_9", "SNP_10", "SNP_11", "SNP_12", "SNP_13", "SNP_14"), class = "factor"), sample_id = c(8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L, 8685L), allele1 = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 1L, 2L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), sample_id_x = c(8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8739L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8832L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 8888L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9056L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9058L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9062L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9072L, 9152L, 9152L), allele2 = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 4L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 4L, 4L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 4L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 2L), .Label = c("A", "G", "T", "C"), class = "factor"), snp_diff = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0), IBS = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1), IBD = c(1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1)), .Names = c("SNP_pos", "sample_id", "allele1", "sample_id_x", "allele2", "snp_diff", "IBS", "IBD"), row.names = c(NA, 100L), class = "data.frame")
以下は、期待出力data.frameあります以下は
私は、出力ファイルを生成するために使用していたコードです:
for (i in 1:nrow(inputData)) { inputData$IBD<-ifelse(inputData$IBD==0,inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x!=shift(inputData$sample_id_x),inputData$IBD<-inputData$IBD,ifelse (inputData$allele1==inputData$allele2&inputData$sample_id_x==shift(inputData$sample_id_x),inputData$IBD<-shift(inputData$IBD)+1,inputData$IBD<-inputData$IBD))) }
- 第1の条件は、それが0
- 第2の条件は、次にカラムallele1かどうかをチェックするようにIBDを残すカラムIBD == 0かどうか、およびそうである場合と比較== allele2もsample_id_xであるか否か、など以前のsample_id_x(上のサンプルID)と等しくなります。この条件が満たされれば、IBDは同じままでなければならない。
- 最後に、column allele1 == allele2およびsample_id_x ==以前のsample_id_x(それよりも上のもの)の場合は、前のIBD(上のもの)にIBDを追加します。 上記のコードは動作しますが、何年も実行されています。
が親切にコードを最適化したり、より良いものを提案して援助を必要とする...
はd.b @ありがとうございます。あなたは命を救っています。長い間、これは私たちに「私たちの頭をレンガの壁に打ちつける」ものでした... –