如何检查 R 中包含混合字母数字索引的列中的重叠-解网

问：

我有一个数据帧，其中一列包含由字母后跟 3 位数字组成的代码字符串。我想检查各行之间的代码是否存在重叠。

例如（请注意，row1 和 row2 之间有一个重叠（B258）：

df <- data.frame(ICD_code = c("A581-A589, B200-B299, B354-B355", "B258, I230-I233, J201-J300, K353", "C001-C005, C020-C023, C781"),
                 category = c(1, 2, 3))

df

                              ICD_code category
    1  A581-A589, B200-B299, B354-B355        1
    2 B258, I230-I233, J201-J300, K353        2
    3       C001-C005, C020-C023, C781        3

为此，我尝试制作一个函数来对“A581-A589”进行序列，增量为 1，但这不起作用，因为 seq（）仅适用于数值。我想知道是否有人知道如何使用字母数字值制作序列？

我想要的结果：

                          ICD_code category overlap
1  A581-A589, B200-B299, B354-B355        1    T
2 B258, I230-I233, J201-J300, K353        2    T
3       C001-C005, C020-C023, C781        3    F

r 字符串序列 data.table

df <- data.frame(
  ICD_code = c("A581-A589, B200-B299, B354-B355",
               "B258, I230-I233, J201-J300, K353",
               "C001-C005, C020-C023, C781"),
  category = c(1, 2, 3)
)
library(tidyverse)


(df2 <- separate_longer_delim(df,
  cols = ICD_code,
  delim = ","
) |>
  separate_wider_delim(
    cols = ICD_code, delim = "-",
    names_sep = "x",
    too_few = "align_start"
  ) |> mutate(
    across(1:2,trimws),
    across(1:2,
      parse_number,
      .names = "num_{col}"
    ),
    first_letter = substr(ICD_codex1, 1, 1)
  ) |> rowwise() |>
  mutate(numseq = list(seq(
    from = num_ICD_codex1,
    to = pmax(num_ICD_codex1, num_ICD_codex2, na.rm = TRUE),
    by = 1))))

(df3 <- split(df2, ~first_letter))

(df4 <- map_lgl(
  df3,
  \(x){anyDuplicated(unlist(x$numseq)) > 0}
))

library(data.table)

DF <- data.frame(ICD_code = c("A581-A589, B200-B299, B354-B355",
                              "B258, I230-I233, J201-J300, K353",
                              "C001-C005, C020-C023, C781"),
                 category = c(1, 2, 3))

setDT(DF)
DT <- copy(DF)
DT <- DT[, .(ICD_code = unlist(strsplit(ICD_code, ", "))), by = category][, prefix := substring(ICD_code, 1, 1)][, suffix := lapply(strsplit(ICD_code, "-"), function(x){if(length(x) > 1){do.call(seq, as.list(as.integer(substring(x, 2))))} else {as.numeric(substring(x, 2))}})]
DT <- DT[, .(suffix = as.character(unlist(suffix))), by = .(category, ICD_code, prefix)][, ICD_code := paste0(prefix, suffix)][, overlap := (duplicated(ICD_code) | duplicated(ICD_code, fromLast=TRUE))]
DF <- merge.data.table(DF, DT[overlap == TRUE, .(category, overlap)], by = "category", all.x = TRUE)[is.na(overlap), overlap := FALSE]
DF[]

结果：

   category                         ICD_code overlap
1:        1  A581-A589, B200-B299, B354-B355    TRUE
2:        2 B258, I230-I233, J201-J300, K353    TRUE
3:        3       C001-C005, C020-C023, C781   FALSE

0赞 Juan P FZ 10/5/2023 #3

这是我的代码：

    df <- data.frame(ICD_code = c("A581-A589, B200-B299, B354-B355", "B258, I230-I233, J201-J300, K353", "C001-C005, C020-C023, C781"),
                 category = c(1, 2, 3))

    df

首先，确定要扩展的不同范围

    library(tidyverse)
    
    
    exp_df = df |> 
      rowid_to_column(var = "original_row") |> 
      separate_longer_delim(cols = ICD_code,delim = ",") |> 
      mutate(ICD_code = str_trim(ICD_code),
             expand_check = str_detect(ICD_code,"-"),
             ICD_code_exp_code = ifelse(expand_check,
                                        str_extract(ICD_code,"[A-Z]"),
                                        "A"), 
             ICD_code_exp_min = ifelse(expand_check,
                                       str_extract(ICD_code,"\\d*(?=-)"),
                                       0),
             ICD_code_exp_max = ifelse(expand_check,
                                       str_extract(ICD_code,"(?<=-[A-Z])\\d*"),
                                       0)) |> 
      mutate(across(c(ICD_code_exp_min,ICD_code_exp_max),as.integer))

这将扩展范围并将它们存储在列中

    # Producing a expanded list of codes
    exp_df$ICD_code_exp = sapply(1:nrow(exp_df),
                                 FUN = function(x){
                                   paste(paste0(exp_df$ICD_code_exp_code[x],
                                                seq(exp_df$ICD_code_exp_min[x],
                                                    exp_df$ICD_code_exp_max[x],
                                                    1)),
                                         collapse = ",")
                                   }
        )
    
    # Replacing the ones that do not need to be expanded
    exp_df$ICD_code_exp[!exp_df$expand_check] = exp_df$ICD_code[!exp_df$expand_check]

通过计数出现的不同行来检测重叠。ICD_code

    overlaps = exp_df |>
      separate_longer_delim(ICD_code_exp,delim = ",") |> 
      mutate(n_dist_rows = n_distinct(original_row),.by = ICD_code_exp) |> 
      summarise(overlap = max(n_dist_rows)>1,
                .by = c(original_row))

输出将是

    df |> 
      rowid_to_column(var = "original_row") |>
      left_join(overlaps,by = join_by(original_row)) |
      select(-original_row)

      #                           ICD_code category overlap
      # 1  A581-A589, B200-B299, B354-B355        1    TRUE
      # 2 B258, I230-I233, J201-J300, K353        2    TRUE
      # 3       C001-C005, C020-C023, C781        3   FALSE

上一个：LSTM 模型没有改进

下一个：在 Python 上过滤重复序列 [duplicate]

如何检查 R 中包含混合字母数字索引的列中的重叠

How to check overlap in a column that contains mixed letter-number index in R

评论

评论