do.call(rbind, ... ) exhibits unexpected behaviour inside a loop

Asked by: Illimar Rekand · Asked: 7/1/2023 · Modified: 7/2/2023 · Views: 33

Q:

I am trying to download some data and have the following:

library(RJSONIO)

first.day <- as.Date(paste0("202201","01"), "%Y%m%d")
today <- Sys.Date()
months <- format(seq(first.day, today,by="month"), "%Y%m")
length(months)

months.df.list <- vector("list", length = length(months))

for (m in months){
  print(m)
  filename <- paste0("./monthly_bike_data/", m, ".csv")
  query <- paste0("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/", substr(m,0,4),"/", substr(m,5,6), ".json")
  month.df.raw <- fromJSON(query) #retrieve data
  month.df <- do.call(rbind,month.df.raw) # convert to a proper dataframe
  write.csv(month.df, filename)
  
}

However, when I run class(month.df), I get

[1] "list"

When I simply run the code

twentytwenty.raw <- fromJSON("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2020/01.json")
twentytwenty <- do.call(rbind, twentytwenty.raw)
class(twentytwenty)

outside the loop, I get

[1] "matrix" "array"

As far as I can tell, the only difference here is that do.call() runs inside the loop rather than outside it, and that alone seems to change the output behaviour.

Does anyone know why this happens, and can anyone suggest a fix?

r rbind rjsonio fromjson

Comments

2 votes · user2554330 · 7/1/2023
You overwrite month.df on every pass through the loop. Which one are you looking at? Since the code works for me, I'd guess it is a download problem: perhaps one of the files does not exist, or the site is blocking your downloads.
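Following up on that comment: one way to see which iteration misbehaves is to fill the pre-allocated months.df.list (the question creates it but never uses it) and to trap errors, so the month whose download or parse fails becomes visible instead of stopping the loop. A minimal diagnostic sketch, assuming the same URLs and RJSONIO::fromJSON as in the question; the tryCatch wrapper and the final class check are additions for debugging, not part of the original code:

library(RJSONIO)

first.day <- as.Date(paste0("202201", "01"), "%Y%m%d")
months <- format(seq(first.day, Sys.Date(), by = "month"), "%Y%m")

# keep every month's result instead of overwriting month.df,
# and record errors so one failed request does not abort the loop
months.df.list <- vector("list", length = length(months))
names(months.df.list) <- months

for (m in months) {
  query <- paste0("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/",
                  substr(m, 1, 4), "/", substr(m, 5, 6), ".json")
  months.df.list[[m]] <- tryCatch(
    do.call(rbind, fromJSON(query)),
    error = function(e) conditionMessage(e)  # keep the error message for inspection
  )
}

# which months produced a matrix, and which produced something else?
sapply(months.df.list, function(x) class(x)[1])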

Answers:

1 vote · margusl · 7/2/2023 · #1

Assuming this is more about fetching those files than about the loop and do.call() itself, I would suggest a different approach.

So I would build a dataframe from the S3 ListBucketResult XML and use those URLs to fetch the CSV files directly. Sorry about the tidyverse overload..

library(xml2)
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
library(lubridate)

# S3 ListBucketResult xml
bucket_url <- "https://data.urbansharing.com/"
urbansharing_xml <- read_xml(bucket_url) %>% xml_ns_strip()

# xml -> dataframe with file dates and urls, for both json and csv
trip_data <- urbansharing_xml %>% 
  xml_find_all(".//Contents") %>% 
  as_list() %>% 
  bind_rows() %>% 
  unnest(cols = c(Key, LastModified, Size))  %>% 
  select(!where(is.list)) %>% 
  mutate(month = str_extract(Key, "\\d{4}/\\d+(?=\\.)") %>% ym(), 
         url = paste0(bucket_url, Key) %>% tibble::char(shorten = "front"),
         .keep = "unused",
         .before = 1)

# select only bergenbysykkel csv files, since 2022-01-01 
bergen_csv <- trip_data %>% 
  filter(str_detect(url, "bergenbysykkel.*csv"),
         month >= date("2022-01-01"))
print(bergen_csv, n = 5)
#> # A tibble: 19 × 4
#>   month      url                                              LastModified Size 
#>   <date>     <char>                                           <chr>        <chr>
#> 1 2022-01-01 …ring.com/bergenbysykkel.no/trips/v1/2022/01.csv 2022-02-28T… 6050…
#> 2 2022-02-01 …ring.com/bergenbysykkel.no/trips/v1/2022/02.csv 2022-03-31T… 6531…
#> 3 2022-03-01 …ring.com/bergenbysykkel.no/trips/v1/2022/03.csv 2022-04-30T… 1155…
#> 4 2022-04-01 …ring.com/bergenbysykkel.no/trips/v1/2022/04.csv 2022-05-31T… 1148…
#> 5 2022-05-01 …ring.com/bergenbysykkel.no/trips/v1/2022/05.csv 2022-06-30T… 1296…
#> # ℹ 14 more rows

# fetch first 3 files, set local names to year_month.csv; 
# local file names are stored as vector names for iwalk
fetch_list <- bergen_csv$url[1:3] %>% 
  set_names(str_replace(., ".*(\\d{4})/(.*)", "\\1_\\2"))
fetch_list
#> <pillar_char<[3]>
#>                                                          2022_01.csv 
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/01.csv 
#>                                                          2022_02.csv 
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/02.csv 
#>                                                          2022_03.csv 
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/03.csv

# walk through fetch_list, download all files
# iwalk() passes each value(url) and name(local file name) to its function 
iwalk(fetch_list, \(url, filename) download.file(url, filename, mode = "wb"))
fs::dir_info(glob = "2022*.csv")[,1:3]
#> # A tibble: 3 × 3
#>   path        type         size
#>   <fs::path>  <fct> <fs::bytes>
#> 1 2022_01.csv file        5.77M
#> 2 2022_02.csv file        6.23M
#> 3 2022_03.csv file       11.02M

# import all downloaded csv files into a single dataframe;
# the filename column stores the source file name
readr::read_csv(names(fetch_list), id = "filename", show_col_types = FALSE)
#> # A tibble: 104,998 × 14
#>    filename    started_at          ended_at            duration start_station_id
#>    <chr>       <dttm>              <dttm>                 <dbl>            <dbl>
#>  1 2022_01.csv 2022-01-01 04:01:22 2022-01-01 04:19:41     1099               33
#>  2 2022_01.csv 2022-01-01 04:01:37 2022-01-01 04:19:38     1081               33
#>  3 2022_01.csv 2022-01-01 04:02:24 2022-01-01 04:10:08      463             1899
#>  4 2022_01.csv 2022-01-01 04:04:04 2022-01-01 04:06:10      125              644
#>  5 2022_01.csv 2022-01-01 04:11:38 2022-01-01 04:23:24      705              368
#>  6 2022_01.csv 2022-01-01 04:45:47 2022-01-01 04:54:35      528             1887
#>  7 2022_01.csv 2022-01-01 05:20:44 2022-01-01 05:27:50      425              814
#>  8 2022_01.csv 2022-01-01 06:09:17 2022-01-01 06:16:29      431              217
#>  9 2022_01.csv 2022-01-01 06:37:29 2022-01-01 06:54:24     1014             1816
#> 10 2022_01.csv 2022-01-01 06:40:14 2022-01-01 06:48:20      486              806
#> # ℹ 104,988 more rows
#> # ℹ 9 more variables: start_station_name <chr>,
#> #   start_station_description <chr>, start_station_latitude <dbl>,
#> #   start_station_longitude <dbl>, end_station_id <dbl>,
#> #   end_station_name <chr>, end_station_description <chr>,
#> #   end_station_latitude <dbl>, end_station_longitude <dbl>

Created on 2023-07-01 with reprex v2.0.2
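As a follow-up usage note, the same set_names() / iwalk() pattern could be adapted to download every listed month into the question's ./monthly_bike_data/ directory rather than just the first three files. A hypothetical sketch building on the bergen_csv dataframe above (the directory and file-name pattern are taken from the question, not from the answer):

# hypothetical adaptation: fetch every month in bergen_csv into the
# question's ./monthly_bike_data/ directory, named YYYYMM.csv
dir.create("monthly_bike_data", showWarnings = FALSE)
fetch_all <- set_names(
  bergen_csv$url,
  file.path("monthly_bike_data", format(bergen_csv$month, "%Y%m.csv"))
)
iwalk(fetch_all, \(url, filename) download.file(url, filename, mode = "wb"))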