将嵌套列表转换为数据帧

Converting nested list to dataframe

提问人:Ritchie Sacramento 提问时间:10/3/2014 最后编辑:moodymudskipperRitchie Sacramento 更新时间:6/1/2023 访问量:65400

问:

目标是将有时包含缺失记录的嵌套列表转换为数据框。缺少记录时的结构示例如下:

# Sample input: three records, the second of which is a "miss" that only
# carries Hit and Error instead of the seven regular fields.
mylist <- list(
  list(Hit = "True", Project = "Blue", Year = "2011", Rating = "4",
       Launch = "26 Jan 2012", ID = "19", Dept = "1, 2, 4"),
  list(Hit = "False", Error = "Record not found"),
  list(Hit = "True", Project = "Green", Year = "2004", Rating = "8",
       Launch = "29 Feb 2004", ID = "183", Dept = "6, 8")
)

当没有缺失的记录时,可以使用 `data.frame(do.call(rbind.data.frame, mylist))` 将列表转换为数据框。但是,当缺少记录时,这会导致列不匹配。我知道有一些函数可以合并列不匹配的数据框,但我还没有找到可以应用于列表的函数。理想的结果是保留记录 2,并将它缺失的所有变量填为 NA。希望得到一些帮助。

r 数据帧 嵌套列表 rbind

评论


答:

22赞 flodel 10/3/2014 #1

您可以创建 data.frames 列表:

# Turn every record into its own one-row data frame, keeping strings as
# character vectors rather than factors.
dfs <- lapply(mylist, function(record) {
  data.frame(record, stringsAsFactors = FALSE)
})

然后使用以下方法之一:

library(plyr)
# Stack the per-record data frames in `dfs` into one data frame.
# NOTE(review): rbind.fill() is expected to fill columns that a frame lacks
# with NA -- confirm against the plyr documentation.
rbind.fill(dfs)

或更快

library(dplyr)
# Stack the per-record data frames in `dfs` into one data frame.
bind_rows(dfs) # in earlier versions: rbind_all(dfs)

在 `dplyr::bind_rows` 的情况下,我很惊讶它选择使用 `""` 而不是 `NA` 来表示缺失的数据。如果你删除 `stringsAsFactors = FALSE`,你会得到 `NA`,但代价是警告......`suppressWarnings(rbind_all(lapply(mylist, data.frame)))` 将是一个丑陋但快速的解决方案。

评论

17赞 psychonomics 1/31/2017
`rbind_all()` 已弃用。请改用 `bind_rows()`。
0赞 PM0087 11/21/2020
如果在某些行中,某些列缺少数据怎么办?数据库中只有空(没有 NA 或 NULL)
0赞 PM0087 11/21/2020
我收到此错误: Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, : arguments imply different number of rows: 1, 0
42赞 hrbrmstr 10/3/2014 #2

您还可以使用 `data.table` 软件包(至少 v1.9.3)中的 `rbindlist`:

library(data.table)

# Row-bind the records of `mylist` directly; with fill=TRUE the fields a
# record lacks show up as NA (see the printed result below).
rbindlist(mylist, fill=TRUE)

##      Hit Project Year Rating      Launch  ID    Dept            Error
## 1:  True    Blue 2011      4 26 Jan 2012  19 1, 2, 4               NA
## 2: False      NA   NA     NA          NA  NA      NA Record not found
## 3:  True   Green 2004      8 29 Feb 2004 183    6, 8               NA

评论

1赞 Arun 10/3/2014
1.9.4 现在可在 CRAN 上使用(尽管剩余的二进制文件可能需要一天的时间才能可用)。
6赞 msoderstrom 7/29/2018
@hrbrmstr 您知道允许使用非统一列表结构的解决方法吗?我遇到了这个错误:`rbind/rbindlist doesn't recycle as it already expects each item to be a uniform list, data.frame or data.table`。
1赞 PM0087 11/21/2020
我收到此错误:`data.table::rbindlist(mylist, fill = TRUE)` 中的错误:第 3 项的第 1 列长度为 2,与长度为 3 的第 5 列不一致。只有长度为 1 的列才会被循环补齐(recycle)。
0赞 PM0087 2/18/2021
嵌套列表怎么样?
12赞 bgoldst 2/21/2016 #3

我刚刚为这个问题开发了一个适用于这里的解决方案,所以我也会在这里提供它:

## Classify a node: NULL stays NULL; a list that has names becomes
## list(type='namedlist'); anything else becomes list(type=<typeof>,len=<length>).
tl <- function(e) {
    if (is.null(e)) return(NULL);
    kind <- typeof(e);
    isHash <- kind == 'list' && !is.null(names(e));
    if (isHash) list(type='namedlist') else list(type=kind,len=length(e));
};
## Join the elements of v into a single comma-separated string.
mkcsv <- function(v) {
    paste(v,collapse=',');
};
## Render a key path as a string for messages: every key is prefixed with '/',
## a NULL key is shown as '*', and a multi-element key is joined with commas.
keyListToStr <- function(keyList) {
    labels <- sapply(keyList,function(key) {
        if (is.null(key)) return('*');
        paste(key,collapse=',');
    });
    paste0('/',labels,collapse='');
};

## Recursively walk one level of a list of records and build the data.frame
## columns found at that level.
##
## nodes holds one entry per output row: whatever each record has at the
## current key path (NULL where the record has nothing there). Depending on the
## common type of the non-NULL nodes the function
##   - recurses per key            (lists with names),
##   - recurses per position       (lists without names), or
##   - builds the column(s) itself (atomic vectors).
##
## Returns a list of single-column data.frames (possibly empty).
## Stops if the non-NULL nodes do not all share one type, or if that type is
## not a list or one of the supported atomic types.
## Side effect: prints the key path being processed on every call.
extractLevelColumns <- function(
    nodes, ## current level node selection
    ..., ## additional arguments to data.frame()
    keyList=list(), ## current key path under main list
    sep=NULL, ## optional string separator on which to join multi-element vectors; if NULL, will leave as separate columns
    mkname=function(keyList,maxLen) paste0(collapse='.',if (is.null(sep) && maxLen == 1L) keyList[-length(keyList)] else keyList) ## name builder from current keyList and character vector max length across node level; default to dot-separated keys, and remove last index component for scalars
) {
    cat(sprintf('extractLevelColumns(): %s\n',keyListToStr(keyList)));
    if (length(nodes) == 0L) return(list()); ## handle corner case of empty main list
    tlList <- lapply(nodes,tl);
    typeList <- do.call(c,lapply(tlList,`[[`,'type')); ## NULL nodes contribute nothing here
    ## every node at this path is NULL (e.g. a field that is null in all records):
    ## there is no type to build a column from, so contribute no columns instead
    ## of failing below with an empty "inconsistent types ()" message
    if (is.null(typeList)) return(list());
    if (length(unique(typeList)) != 1L) stop(sprintf('error: inconsistent types (%s) at %s.',mkcsv(typeList),keyListToStr(keyList)));
    type <- typeList[1L];
    if (type == 'namedlist') { ## hash; recurse
        allKeys <- unique(do.call(c,lapply(nodes,names)));
        ret <- do.call(c,lapply(allKeys,function(key) {
            extractLevelColumns(lapply(nodes,`[[`,key),...,keyList=c(keyList,key),sep=sep,mkname=mkname);
        }));
    } else if (type == 'list') { ## array; recurse
        lenList <- do.call(c,lapply(tlList,`[[`,'len'));
        maxLen <- max(lenList,na.rm=TRUE);
        allIndexes <- seq_len(maxLen);
        ## must be careful to translate out-of-bounds to NULL; happens automatically with string keys, but not with integer indexes
        ret <- do.call(c,lapply(allIndexes,function(index) {
            extractLevelColumns(lapply(nodes,function(node) if (length(node) < index) NULL else node[[index]]),...,keyList=c(keyList,index),sep=sep,mkname=mkname);
        }));
    } else if (type%in%c('raw','logical','integer','double','complex','character')) { ## atomic leaf node; build column
        lenList <- do.call(c,lapply(tlList,`[[`,'len'));
        maxLen <- max(lenList,na.rm=TRUE);
        if (is.null(sep)) {
            ## one column per vector position; rows whose vector is too short get NA
            ret <- lapply(seq_len(maxLen),function(i) setNames(data.frame(sapply(nodes,function(node) if (length(node) < i) NA else node[[i]]),...),mkname(c(keyList,i),maxLen)));
        } else {
            ## keep original type if maxLen is 1, IOW don't stringify
            ret <- list(setNames(data.frame(sapply(nodes,function(node) if (length(node) == 0L) NA else if (maxLen == 1L) node else paste(collapse=sep,node)),...),mkname(keyList,maxLen)));
        }; ## end if
    } else stop(sprintf('error: unsupported type %s at %s.',type,keyListToStr(keyList)));
    if (is.null(ret)) ret <- list(); ## handle corner case of exclusively empty sublists
    ret;
}; ## end extractLevelColumns()
## convenience entry point: collect the columns extracted from mainList and
## bind them side by side; extra arguments are passed on to extractLevelColumns()
flattenList <- function(mainList,...) {
    columns <- extractLevelColumns(mainList,...);
    do.call(cbind,columns);
};

执行:

## define data: three records, the second one only has Hit and Error
mylist <- list(
    list(Hit='True',Project='Blue',Year='2011',Rating='4',Launch='26 Jan 2012',ID='19',Dept='1, 2, 4'),
    list(Hit='False',Error='Record not found'),
    list(Hit='True',Project='Green',Year='2004',Rating='8',Launch='29 Feb 2004',ID='183',Dept='6, 8')
);

## flatten the records into a data.frame
df <- flattenList(mylist);
## extractLevelColumns():
## extractLevelColumns(): Hit
## extractLevelColumns(): Project
## extractLevelColumns(): Year
## extractLevelColumns(): Rating
## extractLevelColumns(): Launch
## extractLevelColumns(): ID
## extractLevelColumns(): Dept
## extractLevelColumns(): Error

df;
##     Hit Project Year Rating      Launch   ID    Dept            Error
## 1  True    Blue 2011      4 26 Jan 2012   19 1, 2, 4             <NA>
## 2 False    <NA> <NA>   <NA>        <NA> <NA>    <NA> Record not found
## 3  True   Green 2004      8 29 Feb 2004  183    6, 8             <NA>

我的函数比 1.9.6 版的 `data.table::rbindlist()` 更强大,因为它可以处理任意数量的嵌套级别,以及各分支之间不同的向量长度。在链接的问题中,我的函数正确地将 OP 的列表展平为 data.frame,而 `data.table::rbindlist()` 失败并显示 "Error in rbindlist(jsonRList, fill = T) : Column 4 of item 16 is length 2, inconsistent with first column of that item which is length 1. rbind/rbindlist doesn't recycle as it already expects each item to be a uniform list, data.frame or data.table"

评论

1赞 jcarlos 9/8/2016
哇,我终于找到了一个解决方案来扁平化我面临的列表类型。谢谢。
3赞 dca 3/29/2018
在一个复杂的列表中尝试了这个,得到了:Error in extractLevelColumns(lapply(nodes, function(node) if (length(node) < : error: inconsistent types () at /V1/2.
1赞 bgoldst 4/16/2018
@GabrielFair(和@dca)如果您发布指向列表的链接(例如在 GitHub 上),我可能会调试和改进我的代码以处理您的列表,或者至少改进错误消息以使其更具描述性/更清晰。
1赞 Gabriel Fair 4/17/2018
谢谢,对不起,我应该说得更清楚。我在帖子底部遇到与你相同的错误,你说你不能扁平化OP的列表。我将创建一个新的 SO 问题,如果我不能自己让它工作。再次感谢
1赞 JLC 2/21/2019
我还收到一个错误:Error in extractLevelColumns(lapply(nodes, [[, key), ..., keyList = c(keyList, : error: inconsistent types
5赞 ishonest 6/16/2021 #4

这是一个将任何嵌套/不均匀列表转换为数据帧的解决方案。`rbindlist` 在许多情况下不起作用,尤其是对于列表的列表。所以我必须创造出比 `rbindlist` 更好的东西。

# Row-bind a list of (possibly nested, unevenly shaped) lists into one data
# frame.
#
# Each element of `l` that is a list is flattened with unlist(), turned into a
# one-row data frame, and the rows are combined with dplyr::bind_rows().
# Elements that are not lists are dropped up front, and elements whose
# conversion raises an error are dropped silently (.errorhandling = 'remove').
#
# Requires the foreach and dplyr packages to be installed; they do not need to
# be attached.
rbindlist.v2 <- function(l)
{
   # Bring the foreach operator into scope locally so callers do not have to
   # run library(foreach) first.
   `%do%` <- foreach::`%do%`
   # inherits() also copes with elements whose class vector has several
   # entries; comparing lapply(l, class) against "list" fails on those.
   l <- l[vapply(l, inherits, logical(1), what = "list")]
   foreach::foreach(element = l, .combine = dplyr::bind_rows, .errorhandling = 'remove') %do%
         {
            flat <- unlist(element)
            as.data.frame(t(flat))
         }
}

对于大型列表,可以通过将 `%do%` 替换为 `%dopar%` 来加快该过程。这也是我的情况所需要的。

评论

0赞 Jakub.Novotny 1/21/2023
这是一个非常酷的功能。您能否解释一下该功能的工作原理?我试图了解它是如何工作的,但这对我来说并不完全简单。
1赞 Ben 5/12/2023
`%do%` 函数在哪里定义?
0赞 anpami 6/1/2023
`%do%` 运算符似乎来自 `foreach` 库。无论如何,该代码在我的情况下不起作用——它生成了只有一行、超过 489,000 列的结果。
1赞 Aaron C 3/14/2023 #5

备选方案@ishonest:

# Take the first element of every entry of `l` and row-bind the results.
df <- purrr::map_dfr(l, function(entry) entry[[1]])

下面是一些其他方法,具体取决于列表的嵌套方式:

列表列表

# Flatten every entry of `r` with unlist() and row-bind the results.
df <- purrr::map_dfr(r, unlist)

如果嵌套列表更复杂,其中某些元素是列表:

# Flatten a two-level list `r` into one data frame.
#
# For every entry x of `r`, x[[1]] is taken as the base object (e.g. vessel
# info) and x[[2]] as its list of events (e.g. places the vessel visited).
# Each event contributes its first element plus all remaining elements as
# columns; the base object is then bound alongside the event rows.
format_json_list <- function(r){
  purrr::map_dfr(r,function(x){
    # Base object, e.g. Vessel info
    b <- x[[1]]
    # object's events, e.g. places Vessel visited
    df <- purrr::map_dfr(x[[2]],function(y){
        v <- y[[1]]
        # y[-1] is "everything after the first element"; unlike
        # y[2:length(y)] it does not count backwards (2:1) when the event has
        # only a single element.
        p <- y[-1]
        dplyr::bind_cols(v,p)
    })
    dplyr::bind_cols(b,df)
  })
}

对于一些复杂的 JSON,重复的变量命名可能是一个问题。一种解决方法是指定命名。下面的代码是对名称进行硬编码。我认为这可以变得动态。

# Flatten a list of vessel records `vo` into one data frame with hard-coded
# column prefixes, to avoid clashes between repeated field names.
# For every vessel: its non-list fields become "Vessel.*" columns; for every
# entry of its Callings, the non-list fields and the non-list fields of the
# entry's Port (prefixed "Port.") are combined and prefixed "Calling.".
# Expects `%>%` to be available (e.g. via library(dplyr) or library(magrittr)).
purrr::map_dfr(vo, function(vessels) {
  if (is.list(vessels)) {
    # Top-level scalar fields of the vessel
    vinfo <- purrr::map_dfr(vessels, function(vessel) {
      if (!is.list(vessel)) {
        vessel
      }
    }) %>% dplyr::rename_all(~ paste0("Vessel.", .))
    # One block of columns per calling, including its port details
    calinfo <- purrr::map_dfr(vessels$Callings, function(calling) {
      if (is.list(calling)) {
        # named call_fields rather than `call` to avoid shadowing base::call
        call_fields <- purrr::map_dfr(calling, function(field) {
          if (!is.list(field)) {
            field
          }
        })
        callport <- purrr::map_dfr(calling$Port, function(port) {
          if (!is.list(port)) {
            port
          }
        }) %>% dplyr::rename_all(~ paste0("Port.", .))
        dplyr::bind_cols(call_fields, callport)
      }
    }) %>% dplyr::rename_all(~ paste0("Calling.", .))
    dplyr::bind_cols(vinfo, calinfo)
  }
}, .id = "Vessel")
1赞 Fredrik Karlsson 4/19/2023 #6

如果你喜欢 `purrr`:

> te <- list(structure(list(Hit = "True", Project = "Blue", Year = "2011", 
Rating = "4", Launch = "26 Jan 2012", ID = "19", Dept = "1, 2, 4"), .Names = c("Hit", "Project", "Year", "Rating", "Launch", "ID", "Dept")), structure(list(
Hit = "False", Error = "Record not found"), .Names = c("Hit", 
"Error")), structure(list(Hit = "True", Project = "Green", Year = "2004", 
Rating = "8", Launch = "29 Feb 2004", ID = "183", Dept = "6, 8"), .Names = c("Hit", "Project", "Year", "Rating", "Launch", "ID", "Dept")))

> str(te)
List of 3
 $ :List of 7
  ..$ Hit    : chr "True"
  ..$ Project: chr "Blue"
  ..$ Year   : chr "2011"
  ..$ Rating : chr "4"
  ..$ Launch : chr "26 Jan 2012"
  ..$ ID     : chr "19"
  ..$ Dept   : chr "1, 2, 4"
 $ :List of 2
  ..$ Hit  : chr "False"
  ..$ Error: chr "Record not found"
 $ :List of 7
  ..$ Hit    : chr "True"
  ..$ Project: chr "Green"
  ..$ Year   : chr "2004"
  ..$ Rating : chr "8"
  ..$ Launch : chr "29 Feb 2004"
  ..$ ID     : chr "183"
  ..$ Dept   : chr "6, 8"
> purrr::map_dfr(te,as_tibble)
# A tibble: 3 × 8
  Hit   Project Year  Rating Launch      ID    Dept    Error           
  <chr> <chr>   <chr> <chr>  <chr>       <chr> <chr>   <chr>           
1 True  Blue    2011  4      26 Jan 2012 19    1, 2, 4 NA              
2 False NA      NA    NA     NA          NA    NA      Record not found
3 True  Green   2004  8      29 Feb 2004 183   6, 8    NA