gsub for 循环创建重复的字符串

gsub for loop creates a repeated character string

提问人:Rizky Merdietio 提问时间:9/24/2023 更新时间:9/24/2023 访问量:33

问:

我有这个数据帧:

test2 <- structure(list(AU = c("MEISYARA D;GUSWENRIVO I;SINGHAM GV", "VANGALA RM;LOSHALI A;BASA KS;CH G;MASTHAN S;GANACHARI BC;MUNGALA SR;TADAKAMADLA J;TADAKAMADLA SK;BALLA SB", 
                               "SEBASTIAN S;FRANCO A;MÂNICA S", "HUO D-M;MAO X-Y;MO W-W;ZHAO F-M;DU M;SUN R-R", 
                               "KIHARA EN;KARANJA SM;WANZALA P;WAGAIYU EG", "LIANG CT;HIGGINS D;ASHAR A", 
                               "SINDI MA;AL-SEBAEI MO;BAMASHMOUS MS", "PALOMINO-SOTO M;CARRANZA-SAMANEZ K;DULANTO-VARGAS J;QUEZADA-MARQUEZ MM;FONSECA GM;RAMIREZ-WONG F", 
                               "PANCI V;HACKMAN L", "STEVENS DW;DUNN MR;MILLS VS;BOWDEN DA;MCMILLAN PJ;HART AC;CHIN C;DAVEY NK;PINKERTON MH", 
                               "ŠVÁBOVÁ NEE UHROVÁ P;BEŇUŠ R;CHOVANCOVÁ NEE KONDEKOVÁ M;VOJTUŠOVÁ A;NOVOTNÝ M;THURZO A", 
                               "RIAZ S;KHAMIS MF;ABDULLAH JY;AHMAD WMAW;ALAM MK", "FAN F;KE W;DAI X;SHI L;LIU Y;LIN Y;CHENG Z;ZHANG Y;CHEN H;DENG Z", 
                               "CASTILLO-ALONSO C;TABILO L;LÓPEZ-LÁZARO S", "ROBERTS G;LUCAS VS;CAMILLERI S;JAYARAMAN J;KASPER KA;LEWIS JM", 
                               "BU W-Q;GUO Y-X;ZHANG D;DU S-Y;HAN M-Q;WU Z-X;TANG Y;CHEN T;GUO Y-C;MENG H-T", 
                               "TANAKA S;KARIBE H;KATO Y;KOMATSUZAKI A;SEKIMOTO T;SHIMOMURA-KUROKI J", 
                               "KLINGBERG G;BENCHIMOL D;BERLIN H;BRING J;GORNITZKI C;ODEBERG J;TRANÆUS S;TWETMAN S;WERNERSSON E;ÖSTLUND P;DOMEIJ H", 
                               "TSOGTSAIKHAN K;HATANO Y;KOSAKA M;YOSHIDA K;MINJUUR T;GARIDKHUU A;SASAKI K;SUZUKI T", 
                               "LOPATIN O;BARSZCZ M;WOŹNIAK KJ", "SINGH M", "VILA-BLANCO N;VARAS-QUINTANA P;TOMÁS I;CARREIRA MJ", 
                               "GIRIJAN P;BOEDI R;MÂNICA S;FRANCO A", "KAVYA S;PRABHU S;ADYANTHYA S;MIQDAD S;ABDULLA R;JAYARAJAN D", 
                               "HINTON MS;MCMILLAN BR;HERSEY KR;LARSEN RT", "HASSAN A;ELHOSENY M;KAYED M", 
                               "JAYAPRIYA T;KELUSKAR V;SRIDHAR M;LOKESH KUMAR S;FERNANDES A", 
                               "NUZZOLESE E;MALERBA G;VELLA GD", "BARRONE J;VIDAL MC;STEVENSON R", 
                               "TIMME M;VIKTOROV J;STEFFENS L;STREETER A;KARCH A;SCHMELING A", 
                               "ALWOHAIBI RN;ALMAIMONI RA;ALSHREFY AJ;ALMUSAILET LI;ALHAZZAA SA;MENEZES RG", 
                               "EL-DESOUKY SS;KABBASH IA", "ANDERSEN IL;OCEPEK M;THINGNES SL;NEWBERRY RC", 
                               "WANG C;TIAN ZK;WEN D;QU W;XU R;LIU Y;JIA H;TANG X;LI J;ZHA L;LIU Y", 
                               "MORGAN J", "SALAZAR-VALENZUELA L;LÓPEZ-LÁZARO S;AGUAYO-CÁDIZ JE;CAPITANEANU C;FONSECA GM", 
                               "MAHABALA KY;NATARAJAN S;MAHMOOD MT;NAYAK AP;RAO A", "BJELOPAVLOVIC M;REDER SR;FRITZEN I;BROCKMANN MA;HARDT J;PETROWSKI K", 
                               "LORETO DBL;BARROS BÁCD", "WALIA M;BHATI K;GULLAIYA J", "ARROYO-BOTE S;MARTÍNEZ-ARROYO C;GALLEGO-ÁLVAREZ MÁ;ARROYO-BOTE C;MANZANARES-CÉSPEDES MC", 
                               "SOORNEEDI N;VENKATACHALAIAH A;MANIKYA S;DASARI A;GADDAM B;LATHA A", 
                               "PALMELA PEREIRA CM;DOS SANTOS AR;GONÇALVES CR;NUSHI V;COUTINHO F;SALVADO E SILVA FJ;DE SOUSA SANTOS RFV", 
                               "JAWANDA MK;GUPTA S;SANDHU H;ESCOBEDO RLO;BHULLAR HS;HAMZA M", 
                               "HARUDIN MH;FRANCO A;JAFFAR N;NOOR MHM;IBRAHIM MA;MANICA S", 
                               "KUHNEN B;FERNANDES CMS;BARROS F;SCARSO FILHO J;GONÇALVES M;SERRA MC", 
                               "VODANOVIĆ M;SUBAŠIĆ M;MILOŠEVIĆ DP;GALIĆ I;BRKIĆ H", 
                               "RAMKUMAR J;GANESH R;VINAY J", "RIVERA-MENDOZA F;ESPINOZA-SILVA PV;FONSECA GM", 
                               "PEREIRA CP;SANTOS R;NUSHI V;LAMEIRO MV;ANTUNES P;CARVALHO R;MAJOR T;ALQAHTANI SJ"
), Author.s..ID = c("57211806807; 57192983097; 36952129500", 
                    "58235484500; 58236398600; 58235265200; 57217734842; 58235484600; 58236171600; 58235043100; 36183085400; 57207799926; 56878742000", 
                    "58235287100; 57205450144; 57189010056", "57215812605; 58519454000; 57924026600; 57924660300; 57829018800; 58519185900", 
                    "57195633934; 6506257821; 6508066300; 6602243600", "58195847100; 15135714000; 58195400400", 
                    "57218218739; 6602870096; 57190741097", "58307513800; 58308177600; 58022245400; 57192995591; 35190337400; 35520201500", 
                    "58484954800; 55185444800", "7401571955; 7402116738; 25651855700; 8552552500; 7101707095; 7201706201; 58303547300; 58569857500; 7003793212", 
                    "57416602300; 6701772603; 57416602400; 58080360400; 58081085000; 36028646100", 
                    "57203321439; 55357687600; 24833021600; 55163807800; 7401492628", 
                    "56727508100; 57218226768; 35932019300; 57203955437; 56668190400; 58081759100; 58082278400; 57840021300; 57189326786; 36839218900", 
                    "58544769500; 58544603300; 46861416800", "57061571400; 8042276500; 6701854111; 51565457300; 26532954500; 56764949400", 
                    "57391780400; 57188842428; 57203335815; 57215374469; 57203317671; 58191839100; 58192302000; 35236150000; 56009909700; 55916628500", 
                    "57194377552; 7003893659; 56539124800; 55318758500; 56441159900; 7004885391", 
                    "55900416400; 57207514529; 57194834663; 6701764799; 57039166200; 58352179300; 6602247508; 57203616204; 58182056400; 56730646000; 58478877100", 
                    "57428699300; 56797610300; 55957033800; 57429757800; 58141532500; 12761531400; 7404487919; 55731667900", 
                    "56898875600; 57224073300; 26640785000", "57195629545", "57205495992; 57217653844; 7005882178; 15061137300", 
                    "58136445000; 57210935896; 57189010056; 57205450144", "58540862700; 25522097000; 58540160600; 57423908600; 57195430892; 58540862800", 
                    "57783544200; 7006925639; 55206855800; 16245238300", "57220773969; 57148260400; 14422375000", 
                    "58074312500; 6504295343; 57204201957; 57268460000; 57901658300", 
                    "56056565700; 6602086533; 57198019376", "58507483800; 56237340500; 7402601773", 
                    "56072719400; 58132576500; 57297746300; 55147892100; 57223444197; 7003946207", 
                    "57904670600; 57904055400; 57904270300; 57904670700; 57904471800; 55517099900", 
                    "57194481538; 16835094000", "7101937964; 35750447700; 55330827100; 7005305192", 
                    "57209341512; 58108459600; 57203920354; 57221468725; 57919847100; 57857535000; 57919671100; 57914942600; 56928003400; 54786023300; 55850166100", 
                    "57700221200", "57539490700; 46861416800; 57540135000; 57195464981; 35190337400", 
                    "57552704200; 57903377000; 58562326200; 57194032731; 57191113816", 
                    "57222986450; 57338272500; 58451580100; 7003454067; 56236335200; 8904910000", 
                    "57209570307; 58453037100", "57325475900; 57465984000; 57396827200", 
                    "56203448200; 57855205700; 57855205800; 57855831000; 57205131975", 
                    "57219023505; 58537713700; 57189588884; 58537055700; 57559779600; 58537055800", 
                    "55433307900; 58186424700; 58070106900; 58185591700; 58185926500; 58186424800; 56979441000", 
                    "55556592000; 57213338080; 58144251500; 58295221000; 58292275300; 57218176879", 
                    "57192662543; 57205450144; 58157816800; 57193712434; 55358169200; 57189010056", 
                    "57221841980; 50261506600; 57221849486; 6504233232; 7202320517; 54917611300", 
                    "12446817300; 6508382225; 57211558771; 57202373100; 6701830546", 
                    "58236897700; 58237381400; 56631932700", "57194795258; 57777793500; 35190337400", 
                    "55433307900; 56979441000; 58185591700; 58145498900; 58145499000; 58145697100; 58145089200; 56450570300"
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))

我想做的是确定 test2$AU 中有一个来自 1 test2$Author.s. 的不同名称。所以我制作了这个脚本:

library(tidyverse)
library(stringr)
au <- strsplit(test2$AU, ";")
author_ids <- strsplit(test2$Author.s..ID, ";")

max_elements <- max(lengths(au), lengths(author_ids))


au <- lapply(au, function(x) c(x, rep(NA, max_elements - length(x))))
author_ids <- lapply(author_ids, function(x) c(x, rep(NA, max_elements - length(x))))

author_data <- data.frame(
  AU = unlist(au, use.names = FALSE),
  Author.s..ID = unlist(author_ids, use.names = FALSE)
)

author_data <- na.omit(author_data)
author_data$Author.s..ID <- gsub(" ","", author_data$Author.s..ID)
author_data <- unique(author_data)


mismatches <- author_data %>%
  group_by(Author.s..ID) %>%
  summarize(AU_count = n_distinct(AU))


mismatches <- mismatches %>%
  filter(AU_count > 1)

if (nrow(mismatches) > 0) {
  print(mismatches)
} else {
  cat("No mismatches or variations found.")
}

然后我做一个 for 循环来替换所有不匹配的东西:

for (i in 1:nrow(mismatches)) {
  q <- mismatches[i, 1]
  rows <- author_data[which(author_data$Author.s..ID == as.character(q)), ]
  for (y in 1:nrow(rows)) {
    name_replaced <- rows$AU[2:y]
    name_fixed <- rows$AU[1]
    for (j in 1:nrow(test2)) {
      for (k in 1:length(name_replaced)) {
        if (grepl(name_replaced[k], test2$AU[j])) {
          test2$AU[j] <- gsub(name_replaced[k], name_fixed, test2$AU[j])
        } else {test2$AU[j] <- test2$AU[j]
        }
      }
    }
  }
}

我制作的 for 循环创建了一个奇怪的字符串子,它重复了字符,如果我尝试运行循环,我仍然不匹配并且输出仍然不正确。

author_data[which(author_data$Author.s..ID == "56979441000"),]

我哪里做错了?

谢谢!

r 字符串 for-loop gsub

评论

0赞 jkd 9/24/2023
亲爱的@Rizky Merdietio,(1)你能试着简化你的例子,让更多人关注实际问题吗?(2) 当 gsub 产生一个重复的字符串时,可能是因为你没有替换你认为要替换的东西(所以你可以试着把它分解一下),或者因为你替换了不止一次。(3) 我不明白实际目标('我想做的是确定 test2$AU 中有一个来自 1 test2$Author.s. 的不同名称。这句话是什么意思?
0赞 Rizky Merdietio 9/24/2023
@jkd 您好,感谢您的发言。我有点难以表达我的意思。所以我试图找到的是:1.找到test2$Author.s.。已绑定到多个 test2$AU 2 的 ID。如果发生这样的事件,我想统一来自 1 个作者 ID 的名称并将其合并为一个名称
1赞 jkd 9/24/2023
我认为我在下面提供的答案应该可以帮助您做到这一点。还有其他问题吗?

答:

1赞 jkd 9/24/2023 #1

如果你最终想做的是找到多次归因的 ID 和具有多个 ID 的作者,我建议你首先整理数据,每行有一个作者和作者 ID。然后,查找重复项就变得容易多了:

library(tidyverse)
# tidy up your data
test2 <- test2 |>
  mutate(ManuscriptID = row_number(), # I assumed these are articles or manuscripts?
         AU = map(AU, \(x) trimws(unlist(strsplit(x,";")))),
         Author.s..ID = map(Author.s..ID, \(x) trimws(unlist(strsplit(x,";"))))) |>
  unnest(c(AU, Author.s..ID))

# find Authors having multiple IDs
test2 |>
  group_by(AU) |>
  filter(n()>1) |>
  arrange(Author.s..ID)

# deal with Authors having multiple IDs
# ... insert your own code here

# find IDs attributed multiple times
test2 |>
  group_by(Author.s..ID) |>
  filter(n()>1) |>
  arrange(Author.s..ID)

# deal with IDs attributed multiple times
# ... insert your own code here

# restore the previous data structure
test2 |>
  group_by(ManuscriptID) |>
  summarise(across(everything(), ~ paste(.x, collapse="; ")))

在这两种情况下,我都由您决定如何更改重复的 ID。

1赞 SamR 9/24/2023 #2

你不需要在这里使用。我同意另一个答案,即您想将数据以长格式显示。我就是这样做的:gsub()

author_id <- data.frame(
    AU = unlist(strsplit(test2$AU, ";")),
    ID = trimws(unlist(strsplit(test2$Author.s..ID, ";"))),
    grp = cumsum(sequence(lengths(strsplit(test2$AU, ";"))) == 1)
)

该列稍后将用于将数据放回宽格式。grp

您的问题似乎是将作者姓名的拼写略有不同,并替换为该名称的第一个实例。例如:

filter(author_id, ID == "57189010056")
#        AU          ID grp
# 1 MÂNICA S 57189010056   3
# 2 MÂNICA S 57189010056  23
# 3 MANICA S 57189010056  45

当您使用 时,您可以非常简单地做到这一点:tidyverse

# Replace the author names with the first instance
author_id <- author_id |>
    group_by(ID) |>
    mutate(AU = AU[1])

这解决了问题,例如,使用前面的示例:

filter(author_id, ID == "57189010056")
#   AU       ID            grp
#   <chr>    <chr>       <int>
# 1 MÂNICA S 57189010056     3
# 2 MÂNICA S 57189010056    23
# 3 MÂNICA S 57189010056    45

现在我们可以将数据放回宽格式:

# Put the data back in wide form
ids_wide <- author_id |>
    split(~grp) |>
    lapply(\(df) data.frame(
        AU = paste(df$AU, collapse = ";"),
        `Author.s..ID` = paste(df$ID, collapse = "; ")
    )) |>
    bind_rows()

输出:

head(ids_wide)[-2,] # Line 2 is  long and wraps terminal
#                                             AU                                                                 Author.s..ID
# 1           MEISYARA D;GUSWENRIVO I;SINGHAM GV                                        57211806807; 57192983097; 36952129500
# 3                SEBASTIAN S;FRANCO A;MÂNICA S                                        58235287100; 57205450144; 57189010056
# 4 HUO D-M;MAO X-Y;MO W-W;ZHAO F-M;DU M;SUN R-R 57215812605; 58519454000; 57924026600; 57924660300; 57829018800; 58519185900
# 5    KIHARA EN;KARANJA SM;WANZALA P;WAGAIYU EG                              57195633934; 6506257821; 6508066300; 6602243600
# 6                   LIANG CT;HIGGINS D;ASHAR A                                        58195847100; 15135714000; 58195400400

评论

1赞 Rizky Merdietio 9/24/2023
谢谢!这很简单,但它解决了所有问题。我需要回去学习基础知识..