计算效率高的方法来操作大型深度嵌套对象的级别？-解网

问：

我有一个向量列表列表（不是错别字，再次确认它实际上是一个向量列表列表），长度为 7600 万。因此，有一个包含 7600 万个项目的列表，其中每个项目都是两个向量的列表。

所有向量均为均匀长度（6 项）。

例如，数据本身如下所示：list_of_list[1:50]

dput输出

list(list(c(4, 4, 1, 0, 1, 0), c(3, 3, 2, 2, 0, 0)), list(c(4, 
4, 1, 0, 1, 0), c(3, 4, 3, 1, 0, 0)), list(c(4, 4, 1, 0, 1, 0
), c(4, 5, 1, 0, 0, 1)), list(c(4, 4, 1, 0, 1, 0), c(5, 8, 0, 
0, 0, 1)), list(c(4, 4, 1, 0, 1, 0), c(5, 5, 0, 2, 0, 0)), list(
    c(4, 4, 1, 0, 1, 0), c(7, 11, 0, 0, 0, 0)), list(c(4, 4, 
1, 0, 1, 0), c(4, 5, 1, 0, 0, 1)), list(c(4, 4, 1, 0, 1, 0), 
    c(4, 4, 1, 0, 1, 0)), list(c(4, 4, 1, 0, 1, 0), c(6, 10, 
1, 0, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(3, 4, 3, 1, 0, 0)), 
    list(c(4, 4, 1, 0, 1, 0), c(5, 7, 2, 0, 0, 0)), list(c(4, 
    4, 1, 0, 1, 0), c(40, 10, 0, 15, 8, 0)), list(c(4, 4, 1, 
    0, 1, 0), c(24L, 7L, 6L, 20L, 8L, 1L)), list(c(4, 4, 1, 0, 
    1, 0), c(39L, 22L, 9L, 5L, 8L, 1L)), list(c(4, 4, 1, 0, 1, 
    0), c(34, 36, 17, 15, 0, 2)), list(c(4, 4, 1, 0, 1, 0), c(36L, 
    42L, 18L, 4L, 5L, 1L)), list(c(4, 4, 1, 0, 1, 0), c(4, 5, 
    1, 0, 0, 1)), list(c(4, 4, 1, 0, 1, 0), c(4, 8, 3, 0, 0, 
    0)), list(c(4, 4, 1, 0, 1, 0), c(3, 1, 2, 2, 0, 0)), list(
        c(4, 4, 1, 0, 1, 0), c(6, 9, 0, 1, 0, 0)), list(c(4, 
    4, 1, 0, 1, 0), c(5, 5, 0, 2, 0, 0)), list(c(4, 4, 1, 0, 
    1, 0), c(6, 10, 1, 0, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(6, 
    10, 1, 0, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(7, 15, 0, 0, 
    0, 0)), list(c(4, 4, 1, 0, 1, 0), c(7, 11, 0, 0, 0, 0)), 
    list(c(4, 4, 1, 0, 1, 0), c(4, 2, 1, 2, 0, 0)), list(c(4, 
    4, 1, 0, 1, 0), c(28, 24, 19, 14, 4, 0)), list(c(4, 4, 1, 
    0, 1, 0), c(40, 56, 19, 11, 0, 0)), list(c(4, 4, 1, 0, 1, 
    0), c(32L, 33L, 14L, 17L, 1L, 2L)), list(c(4, 4, 1, 0, 1, 
    0), c(24L, 55L, 11L, 16L, 6L, 1L)), list(c(4, 4, 1, 0, 1, 
    0), c(27, 10, 6, 19, 8, 0)), list(c(4, 4, 1, 0, 1, 0), c(31, 
    21, 11, 19, 4, 0)), list(c(4, 4, 1, 0, 1, 0), c(37L, 60L, 
    12L, 7L, 5L, 1L)), list(c(4, 4, 1, 0, 1, 0), c(29L, 8L, 3L, 
    18L, 8L, 1L)), list(c(4, 4, 1, 0, 1, 0), c(21L, 24L, 20L, 
    14L, 5L, 1L)), list(c(4, 4, 1, 0, 1, 0), c(6, 10, 1, 0, 0, 
    0)), list(c(4, 4, 1, 0, 1, 0), c(5, 9, 2, 0, 0, 0)), list(
        c(4, 4, 1, 0, 1, 0), c(7, 13, 0, 0, 0, 0)), list(c(4, 
    4, 1, 0, 1, 0), c(6, 12, 1, 0, 0, 0)), list(c(4, 4, 1, 0, 
    1, 0), c(5, 8, 1, 1, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(5, 
    7, 0, 2, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(7, 11, 0, 0, 
    0, 0)), list(c(4, 4, 1, 0, 1, 0), c(5, 6, 1, 1, 0, 0)), list(
        c(4, 4, 1, 0, 1, 0), c(4, 3, 0, 3, 0, 0)), list(c(4, 
    4, 1, 0, 1, 0), c(3, 2, 3, 1, 0, 0)), list(c(4, 4, 1, 0, 
    1, 0), c(4, 4, 1, 2, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(3, 
    3, 2, 2, 0, 0)), list(c(4, 4, 1, 0, 1, 0), c(5, 7, 0, 2, 
    0, 0)), list(c(4, 4, 1, 0, 1, 0), c(3, 1, 2, 2, 0, 0)), list(
        c(4, 4, 1, 0, 1, 0), c(6, 7, 0, 1, 0, 0)))

仅供参考，这个“列表的列表”是用 combn() 按如下模板生成的：combn(focal_list,2,simplify = FALSE)

有没有一种计算有效的方法可以将其转换为包含两列的表，其中每行是列表列表中的一个项目？所有第一个向量都变成第一列，所有第二个向量都变成第二列？

我尝试了以下方法，但它运行了 10-12 分钟仍未结束，也没有任何输出，这对于我的用例来说开销太大了：

# Build a two-column data.table from list_of_list: col1 is taken from the
# first element of every item and col2 from the second.
# seq_along() is used instead of 1:length() so an empty list yields no
# iterations (1:0 would iterate over c(1, 0) and fail on the subscript).
# NOTE(review): `[` keeps each cell as a length-one list wrapping the vector;
# switch to `[[` if the bare vector is wanted in each cell.
dt <- data.table(col1 = lapply(seq_along(list_of_list), function(x) list_of_list[[x]][1]),
                 col2 = lapply(seq_along(list_of_list), function(x) list_of_list[[x]][2]))

我可以先用一个 foreach 循环把这个深度嵌套的对象拆开，把各向量读成由简单分隔符连接的字符串，然后再用另一个 foreach 循环创建一个 data.table；但在这样做之前，我想确认 R 中是否有我没注意到的更简单的方法？

需要说明的是，我希望保留最底层项目的 vector() 性质：也就是说，用列表的列表生成表格时，每个单元格都应该是一个向量，data.table 应该只有两列。但 R 在生成表格时似乎倾向于把 vectors 和 list 展平。

R 优化 data.table nested-lists nested-object

我建议您像下面的代码一样使用 Rcpp。既然您有 7600 万个项目，我建议分批处理数据，例如每批 1000 万个。在我的电脑上，将 1000 万个项目转换为矩阵需要 8 秒，这意味着分 8 批处理大约需要 70-80 秒。请把各批得到的矩阵分别保存下来，最后再合并成一个，例如把它们写入硬盘上的同一个文件中。

# Flatten a list of n items (each a list of m numeric vectors of length p)
# into an (n * p) x m numeric matrix. Column j stacks the j-th vector of
# every item, so item i occupies rows (i - 1) * p + 1 through i * p.
# The C++ source sits inside a single-quoted R string, so it must not
# contain any single-quote characters.
Rcpp::cppFunction(
'NumericVector combineList(std::vector< std::vector<std::vector<double>>> x){
    // No shape can be inferred from an empty input: return an empty 0 x 0
    // matrix instead of reading x[0] or x[0][0] out of bounds.
    if (x.empty() || x[0].empty()) {
        NumericVector none(0);
        none.attr("dim") = Dimension(0, 0);
        return none;
    }

    // R_xlen_t keeps the offset arithmetic out of 32-bit int overflow range.
    const R_xlen_t n = x.size();
    const R_xlen_t m = x[0].size();
    const R_xlen_t p = x[0][0].size();

    // Fill the R vector directly rather than building a temporary
    // std::vector and copying it a second time through wrap().
    NumericVector z(n * p * m);
    for (R_xlen_t i = 0; i < n; ++i) {
        // Every item must match the shape of the first one, otherwise the
        // indexing below would read past the end of a shorter vector.
        if (static_cast<R_xlen_t>(x[i].size()) != m)
            stop("every item must contain the same number of vectors");
        for (R_xlen_t j = 0; j < m; ++j) {
            if (static_cast<R_xlen_t>(x[i][j].size()) != p)
                stop("every vector must have the same length");
            for (R_xlen_t k = 0; k < p; ++k)
                // Column-major offset of row (p * i + k) in column j.
                z[p * (i + n * j) + k] = x[i][j][k];
        }
    }
    z.attr("dim") = Dimension(n * p, m);
    return z;
}'
)

# NOTE(review): the question names this object list_of_list; presumably the
# same data is meant here - confirm the variable name before running.
combineList(list_of_lists)
       [,1] [,2]
  [1,]    4    3
  [2,]    4    3
  [3,]    1    2
  [4,]    0    2
  [5,]    1    0
  [6,]    0    0
  [7,]    4    3
  [8,]    4    4
  [9,]    1    3
 [10,]    0    1
 [11,]    1    0
 [12,]    0    0
 [13,]    4    4
 [14,]    4    5
 [15,]    1    1
 [16,]    0    0
 [17,]    1    0
 [18,]    0    1
 [19,]    4    5
 [20,]    4    8

0赞 Sudoh 7/14/2023 #3

我能够使用这一行代码相当轻松地解决这个问题：

# rapply(..., list, how = "replace") wraps every leaf vector in a one-element
# list while keeping the nesting, so rbindlist() can presumably bind one row
# per item with each vector kept intact as a list cell.
# NOTE(review): focal_list is presumably the nested list from the question
# (called list_of_list there) - confirm; rbindlist() comes from data.table.
rbindlist(rapply(focal_list, list, how = "replace"))

有意思的是，上面的代码在大约 2 分钟内就处理完了全部 7600 万个项目，而且不需要 Rcpp（不过我不确定这个软件包在底层是否使用了 Rcpp）。

上一个：如何让 IJulia 在 mamba 环境中工作？

下一个：如何在特定索引处插入 polars.lit（“some_string”）？

计算效率高的方法来操作大型深度嵌套对象的级别？

computationally efficient way to manipulate the levels of large deeply-nested objects?

评论

评论