提问人:bvowe 提问时间:4/5/2020 最后编辑:MichaelChiricobvowe 更新时间:4/6/2020 访问量:791
R 将列表转换为 Data.Frame 或表格
R Convert List Into Data.Frame or Table
问:
list1 = list(
c(4,5,6,7,1,1,1,1,3,1,3,3),
c(3,4,5,6,2,2,2,2,1,4,2,1),
c(1,2,3,4,1,1,1,1,3,2,1,1),
c(5,6,7,8,1,1,1,1,4,4,4,3),
c(2,3,4,5,2,2,2,2,2,1,2,1)
)
data1=data.frame("ID"=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5),
"Time"=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4),
"Grade"=c(4,5,6,7,3,4,5,6,1,2,3,4,5,6,7,8,2,3,4,5),
"Class"=c(1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2),
"Score"=c(3,1,3,3,1,4,2,1,3,2,1,1,4,4,4,3,2,1,2,1))
我有“list1”中“list1”中的每个项目都等于一个人 4 年的成绩、班级、分数。 因此,“list1”有 5 名学生,每个学生有 12 条记录(成绩、班级和分数这三个变量中的每一个都有 4 条记录)。我希望将“list1”转换为“data1”,这是一个长数据文件,其中“ID”等于“list1”中的列表项编号。时间等于记录的时间(每个学生有 4 个时间度量),成绩等于 list1 中所有元素中的前 4 个数据点,Class 等于接下来的 4 个数据点,Score 等于最后 4 个。
示例输出显示将“list1”转换为所需输出“data1”。
这个数据集是巨大的,所以我希望有一种有效的方法来进行这种转换。
答:
一个解决方案可以是:purrr
dplyr
map_dfr(.x = list1,
~ as.data.frame(matrix(.x, 4, 3)) %>%
setNames(c("Grade", "Class", "Score")), .id = "ID") %>%
group_by(ID) %>%
mutate(Time = 1:n())
ID Grade Class Score Time
<chr> <dbl> <dbl> <dbl> <int>
1 1 4 1 3 1
2 1 5 1 1 2
3 1 6 1 3 3
4 1 7 1 3 4
5 2 3 2 1 1
6 2 4 2 4 2
7 2 5 2 2 3
8 2 6 2 1 4
9 3 1 1 3 1
10 3 2 1 2 2
11 3 3 1 1 3
12 3 4 1 1 4
13 4 5 1 4 1
14 4 6 1 4 2
15 4 7 1 4 3
16 4 8 1 3 4
17 5 2 2 2 1
18 5 3 2 1 2
19 5 4 2 2 3
20 5 5 2 1 4
使用基础 R,我们可以遍历每个列表的索引并创建一个数据帧。list1
do.call(rbind, lapply(seq_along(list1), function(i)
data.frame(ID = i, Time = 1:4, Grade = list1[[i]][1:4],
Class = list1[[i]][5:8], Score = list1[[i]][9:12])))
# ID Time Grade Class Score
#1 1 1 4 1 3
#2 1 2 5 1 1
#3 1 3 6 1 3
#4 1 4 7 1 3
#5 2 1 3 2 1
#6 2 2 4 2 4
#7 2 3 5 2 2
#8 2 4 6 2 1
#9 3 1 1 1 3
#10 3 2 2 1 2
#11 3 3 3 1 1
#12 3 4 4 1 1
#13 4 1 5 1 4
#14 4 2 6 1 4
#15 4 3 7 1 4
#16 4 4 8 1 3
#17 5 1 2 2 2
#18 5 2 3 2 1
#19 5 3 4 2 2
#20 5 4 5 2 1
评论
data.table
使用 1000 万个数据点
数据:
list1 = list(
c(4,5,6,7,1,1,1,1,3,1,3,3),
c(3,4,5,6,2,2,2,2,1,4,2,1),
c(1,2,3,4,1,1,1,1,3,2,1,1),
c(5,6,7,8,1,1,1,1,4,4,4,3),
c(2,3,4,5,2,2,2,2,2,1,2,1))
big_list <- unlist(mget(x = rep('list1', 100000)), recursive = FALSE)
代码: - 使用 Base-R: split()
system.time({
col_levels <- rep(c('Grade', 'Class', 'Score'), each = 4)
for(x in seq_along(big_list)){
big_list[[x]] <- do.call('cbind', list(ID = x, Time = 1:4,
do.call('cbind', split(big_list[[x]], col_levels))))
}
final_df <- do.call('rbind', big_list)
})
# user system elapsed
# 82.86 0.31 83.78
比较:使用 data.table
@MichaelChirico
library('data.table')
system.time({
# 4 = # of Times per ID&Column (assuming your table is rectangular)
out = CJ(ID = 1:length(big_list), Time = 1:4)
# relies on ID being an integer, so that ID = 1 --> list1[[1]]
# gives ID=1's data
out[ , by = ID, c('Grade', 'Class', 'Score') := {
as.data.table(matrix(big_list[[ .BY$ID ]], ncol = 3L))
}]
})
# user system elapsed
# 76.22 0.25 76.80
输出
dim(final_df)
# [1] 2000000 5
head(final_df)
# ID Time Class Grade Score
# [1,] 1 1 1 4 3
# [2,] 1 2 1 5 1
# [3,] 1 3 1 6 3
# [4,] 1 4 1 7 3
# [5,] 2 1 2 3 1
# [6,] 2 2 2 4 4
我不确定它是否有效,但它很简洁:
setDT(list1)
# could also do something like paste0('student', 1:5) for clarity,
# and adjust patterns() below accordingly
setnames(list1, paste0(1:5))
# 4 = # of values of Time
list1[ , colid := rep(c('Grade', 'Class', 'Score'), each = 4L)]
# 3 = # of columns "stacked" in each student's column initially
list1[ , Time := rep(1:4, 3L)]
# first, reshape long
list1[ , melt(.SD, measure.vars = patterns('^[0-9]+'), variable.name = 'ID',
variable.factor = FALSE)
# now, reshape to the final format
][ , dcast(.SD, ID + Time ~ colid, value.var = 'value')]
# ID Time Class Grade Score
# <char> <int> <num> <num> <num>
# 1: 1 1 1 4 3
# 2: 1 2 1 5 1
# 3: 1 3 1 6 3
# 4: 1 4 1 7 3
# 5: 2 1 2 3 1
# 6: 2 2 2 4 4
# 7: 2 3 2 5 2
# 8: 2 4 2 6 1
# 9: 3 1 1 1 3
# 10: 3 2 1 2 2
# 11: 3 3 1 3 1
# 12: 3 4 1 4 1
# 13: 4 1 1 5 4
# 14: 4 2 1 6 4
# 15: 4 3 1 7 4
# 16: 4 4 1 8 3
# 17: 5 1 2 2 2
# 18: 5 2 2 3 1
# 19: 5 3 2 4 2
# 20: 5 4 2 5 1
# ID Time Class Grade Score
效率低下将来自在这里进行两次操作。
首先构建表框架,然后填充它的方法可能更快,如下所示:
# 4 = # of Times per ID&Column (assuming your table is rectangular)
out = CJ(ID = 1:length(list1), Time = 1:4)
# relies on ID being an integer, so that ID = 1 --> list1[[1]]
# gives ID=1's data
out[ , by = ID, c('Grade', 'Class', 'Score') := {
as.data.table(matrix(list1[[ .BY$ID ]], ncol = 3L))
}]
这可能也是低效的,但这段代码比替代方案更具可读性:as.data.table
out = CJ(ID = 1:length(list1), Time = 1:4)
out[ , by = ID, c('Grade', 'Class', 'Score') := {
student_data = list1[[.BY$ID]]
lapply(1:3, function(j) student_data[4L*(j-1) + 1:4])
}]
这是另一个非常快速的基本解决方案。它不那么优雅,但这个想法是,我们通过用循环填充矩阵来最大限度地减少内存使用。
mat = matrix(0, nrow = length(list1) * 4L, ncol = 5L, dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score")))
rw = 0L
times = 1:4
for (i in seq_along(list1)) {
l = list1[[i]]
new_rw = length(l) / 3
inds = seq_len(new_rw) + rw
mat[inds, 1L] = i
mat[inds, 2L] = times
mat[inds, 3:5] = matrix(l, ncol = 3L)
rw = new_rw + rw
}
这里有一个更快的方法,它通过按一定顺序选择我们的未列出元素来取消列出,然后创建一个矩阵:
n = length(list1)
matrix(unlist(list1, use.names = FALSE)[rep(rep(1:4, n) + 12 * rep(0:(n-1L), each = 4), 3) + rep(c(0, 4, 8), each = n * 4L)], ncol = 3)
最后,如果你仍然需要速度,可以使用:Rcpp
Rcpp::cppFunction(
" NumericMatrix rcpp_combo(List x) {
NumericMatrix out(x.size() * 4, 5);
int init = 0;
for (int i = 0; i < x.size(); i++) {
NumericVector tmp = x(i);
int ID = i + 1;
for (int j = 0; j < 4; j++) {
int ind = j + init;
out(ind, 0) = ID;
out(ind, 1) = j + 1;
out(ind, 2) = tmp(j);
out(ind, 3) = tmp(4 + j);
out(ind, 4) = tmp(8 + j);
}
init += 4;
}
return(out);
}"
)
rcpp_combo(list1)
使用 @Sathish 的基准测试,这些方法介于 0.05 到 2 秒之间。
big_list <- unlist(mget(x = rep('list1', 100000)), recursive = FALSE)
system.time(rcpp_combo(big_list))
## user system elapsed
## 0.07 0.00 0.06
system.time({
n = length(big_list)
mat2 = matrix(unlist(big_list, use.names = FALSE)[rep(rep(1:4, n) + 12 * rep(0:(n-1L), each = 4), 3) + rep(c(0, 4, 8), each = n * 4L)], ncol = 3)
})
## user system elapsed
## 0.20 0.02 0.22
big_list <- unlist(mget(x = rep('list1', 100000)), recursive = FALSE)
system.time({
mat = matrix(0, nrow = length(big_list) * 4L, ncol = 5L, dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score")))
rw = 0L
times = 1:4
for (i in seq_along(big_list)) {
l = big_list[[i]]
new_rw = length(l) / 3
inds = seq_len(new_rw) + rw
mat[inds, 1L] = i
mat[inds, 2L] = times
mat[inds, 3:5] = matrix(l, ncol = 3L)
rw = new_rw + rw
}
})
## user system elapsed
## 2.08 0.03 2.21
评论