提问人:CLARA 提问时间:8/22/2023 最后编辑:DubukayCLARA 更新时间:8/24/2023 访问量:65
R 中两个数据帧之间的数字不精确匹配
Inexact matching by number between two data frames in R
问:
使用以下代码,我打算匹配两个包含电话呼叫的文件。一个文件属于客户端,另一个文件属于提供程序。通过号码呼叫 (ANUM)、接听呼叫的号码 (BNUM) 和呼叫开始的时刻 (SECONDS) 进行匹配。变量 SECONDS 表示以秒为单位的特定时间点。也就是说,秒+分钟60+小时60*60,如果分析了几个不同的日期,也可能包括一天。
变量 SECONDS 在客户和供应商之间最多可以相差 X,例如 5 秒。也就是说,客户在与提供商最多相差 5 秒的时间点从号码 A 到号码 B 的呼叫应被视为同一呼叫并匹配。
我要获取的输出文件是:
使用客户端文件的列进行Matcheted调用。
使用供应商文件的列进行匹配调用。目标是将此文件与前一个文件连接起来(例如,将其粘贴到 excel 中,一个挨着另一个),匹配完成后,比较调用的持续时间,看看持续时间在哪里有差异。也就是说,此文件 2 必须与文件 1 具有相同的维度。
客户端具有但不具有提供程序的调用。
提供程序具有但客户端没有的调用。
我展示的代码一定在某处有错误,因为当我尝试使用示例时,我在文件 1 和 2 中没有得到相同的维度。
library("readxl")
library("dplyr")
library("writexl")
datavendor <- read_excel("CDRS_VENDOR_1MARCH.xlsx")
dataclient <- read_excel("CDRS_CLIENT_1MARCH.xlsx")
# Perform type conversions
dataclient$ANUM <- as.numeric(dataclient$ANUM)
datavendor$ANUM <- as.numeric(datavendor$ANUM)
dataclient$BNUM <- as.numeric(dataclient$BNUM)
datavendor$BNUM <- as.numeric(datavendor$BNUM)
dataclient$SECONDS <- as.numeric(dataclient$SECONDS)
datavendor$SECONDS <- as.numeric(datavendor$SECONDS)
# Define the function to perform join and assignment operations
match_and_assign <- function(data1, data2, time_offset) {
for (i in 0:time_offset) {
temp_data2 <- data2 %>%
mutate(SECONDS = SECONDS + i)
ok_data <- semi_join(data1, temp_data2, by = c("ANUM", "BNUM", "SECONDS"))
anti_data1 <- anti_join(data1, temp_data2, by = c("ANUM", "BNUM", "SECONDS"))
data1 <- anti_data1
if (i == 0) {
match_data <- ok_data
} else {
match_data <- rbind(match_data, ok_data)
}
}
return(list(match_data = match_data, unmatched_data = data1))
}
# Define the maximum value of time deviation
max_time_offset <- 5
# Perform union and assignment operations with a definite function
result_vendor <- match_and_assign(datavendor, dataclient, max_time_offset)
result_client <- match_and_assign(dataclient, datavendor, max_time_offset)
# Accessing the desired results
okvendor <- result_vendor$match_data
okclient <- result_client$match_data
unmatched_vendor <- result_vendor$unmatched_data
unmatched_client <- result_client$unmatched_data
# Write the files
write_xlsx(okclient,"/Users/Downloads/Matched_client.xlsx")
write_xlsx(okvendor,"/Users/Downloads/Matched_vendor.xlsx")
write_xlsx(unmatched_client,"/Users/Downloads/Unmatched_client.xlsx")
write_xlsx(unmatched_vendor,"/Users/Downloads/Unmatched_vendor.xlsx")
dput(head(dataclient))
structure(list(`Event time` = structure(c(1677628826, 1677628847, 1677628909, 1677628880, 1677628849, 1677628874), tzone = "UTC", class = c("POSIXct", "POSIXt")), ANUM = c(20795883477, 22765981865, 21785847031, 22774133419, 22784534311, 25784234311), BNUM = c(26774521101, 20785899196, 22734269069, 22795950021, 28794493816, 22715594275), SECONDS = c(3244, 3265, 3327, 3298, 3267, 3292), HORA = c(0, 0, 0, 0, 0, 0), MINUTO = c(0, 0, 1, 1, 0, 1), SEGUNDO = c(26, 47, 49, 20, 49, 14)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
dput(head(datavendor))
structure(list(BNUM = c(21795985477, 21785961865, 22735847031, 22774137419, 25788234311, 22724234311), ANUM = c(22776521101, 22787899196, 23764269869, 21793650021, 22794443816, 22745594275), connecttime = c("2023-03-01 00:00:37.084+00", "2023-03-01 00:00:45.091+00", "2023-03-01 00:01:03.677+00", "2023-03-01 00:01:08.593+00", "2023-03-01 00:01:15.838+00", "2023-03-01 00:01:18.033+00"), SECONDS = c(37, 45, 63, 68, 75, 78), HORA = c("00", "00", "00", "00", "00", "00"), MINUTO = c("00", "00", "01", "01", "01", "01"), SEGUNDO = c("37", "45", "03", "08", "15", "18")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
答:
对于包来说,这感觉是一个非常好的问题,它完全执行了您有兴趣执行的那种不精确的合并。对于一些演示数据(如注释中所述,示例数据不够全面,无法演示连接):fuzzyjoin
library(fuzzyjoin)
dataclient <- data.frame(SECONDS=(1:15)*10,
ANUM=trunc(runif(15)*1e9),
BNUM=trunc(runif(15)*1e9))
datavendor <- data.frame(SECONDS=round((0:10)*10+runif(11)*5),
ANUM=trunc(runif(11)*1e9),
BNUM=trunc(runif(11)*1e9))
difference_full_join(dataclient, datavendor, by="SECONDS", max_dist=5)
其中,我们将 SECONDS 列设置为具有大约 5 的差异,然后使用 5 进行合并,以便恢复连接:max_dist
SECONDS.x ANUM.x BNUM.x SECONDS.y ANUM.y BNUM.y
1 10 800645539 634349609 13 193615068 900241998
2 20 995521628 722356639 25 785895029 360213103
3 30 114639543 797989587 25 785895029 360213103
4 30 114639543 797989587 35 237932417 214477707
5 40 667134090 312174350 35 237932417 214477707
6 40 667134090 312174350 45 195937151 966350245
7 50 405507121 995470385 45 195937151 966350245
8 50 405507121 995470385 53 83245561 373993374
9 60 901356423 243821657 63 293882356 178900313
10 70 1462034 824282414 72 857817749 10435715
11 80 272650450 845019055 82 917233256 214198129
12 90 461754051 36351310 92 592704539 518927375
13 100 15183958 886851646 101 438603267 884772811
14 110 490163355 483237234 NA NA NA
15 120 743737922 537729894 NA NA NA
16 130 625128402 294578106 NA NA NA
17 140 608192171 635717906 NA NA NA
18 150 262778125 993660240 NA NA NA
19 NA NA NA 4 872109025 453920802
在这种情况下,客户端具有但供应商没有的调用将在 .y 列中显示为 NA,而供应商具有但客户端没有的调用将在 .x 列中显示为 NA。
评论
dput(head(dataclient))
dput(head(datavendor))
dataclient |> mutate(SECONDS_high = SECONDS + 5) |> left_join(datavendor, join_by(ANUM, BNUM, SECONDS <= SECONDS, SECONDS_high >= SECONDS))