提问人:Rizky Merdietio 提问时间:9/24/2023 更新时间:9/24/2023 访问量:33
gsub for 循环创建重复的字符串
gsub for loop creates a repeated character string
问:
我有这个数据帧:
test2 <- structure(list(AU = c("MEISYARA D;GUSWENRIVO I;SINGHAM GV", "VANGALA RM;LOSHALI A;BASA KS;CH G;MASTHAN S;GANACHARI BC;MUNGALA SR;TADAKAMADLA J;TADAKAMADLA SK;BALLA SB",
"SEBASTIAN S;FRANCO A;MÂNICA S", "HUO D-M;MAO X-Y;MO W-W;ZHAO F-M;DU M;SUN R-R",
"KIHARA EN;KARANJA SM;WANZALA P;WAGAIYU EG", "LIANG CT;HIGGINS D;ASHAR A",
"SINDI MA;AL-SEBAEI MO;BAMASHMOUS MS", "PALOMINO-SOTO M;CARRANZA-SAMANEZ K;DULANTO-VARGAS J;QUEZADA-MARQUEZ MM;FONSECA GM;RAMIREZ-WONG F",
"PANCI V;HACKMAN L", "STEVENS DW;DUNN MR;MILLS VS;BOWDEN DA;MCMILLAN PJ;HART AC;CHIN C;DAVEY NK;PINKERTON MH",
"ŠVÁBOVÁ NEE UHROVÁ P;BEŇUŠ R;CHOVANCOVÁ NEE KONDEKOVÁ M;VOJTUŠOVÁ A;NOVOTNÝ M;THURZO A",
"RIAZ S;KHAMIS MF;ABDULLAH JY;AHMAD WMAW;ALAM MK", "FAN F;KE W;DAI X;SHI L;LIU Y;LIN Y;CHENG Z;ZHANG Y;CHEN H;DENG Z",
"CASTILLO-ALONSO C;TABILO L;LÓPEZ-LÁZARO S", "ROBERTS G;LUCAS VS;CAMILLERI S;JAYARAMAN J;KASPER KA;LEWIS JM",
"BU W-Q;GUO Y-X;ZHANG D;DU S-Y;HAN M-Q;WU Z-X;TANG Y;CHEN T;GUO Y-C;MENG H-T",
"TANAKA S;KARIBE H;KATO Y;KOMATSUZAKI A;SEKIMOTO T;SHIMOMURA-KUROKI J",
"KLINGBERG G;BENCHIMOL D;BERLIN H;BRING J;GORNITZKI C;ODEBERG J;TRANÆUS S;TWETMAN S;WERNERSSON E;ÖSTLUND P;DOMEIJ H",
"TSOGTSAIKHAN K;HATANO Y;KOSAKA M;YOSHIDA K;MINJUUR T;GARIDKHUU A;SASAKI K;SUZUKI T",
"LOPATIN O;BARSZCZ M;WOŹNIAK KJ", "SINGH M", "VILA-BLANCO N;VARAS-QUINTANA P;TOMÁS I;CARREIRA MJ",
"GIRIJAN P;BOEDI R;MÂNICA S;FRANCO A", "KAVYA S;PRABHU S;ADYANTHYA S;MIQDAD S;ABDULLA R;JAYARAJAN D",
"HINTON MS;MCMILLAN BR;HERSEY KR;LARSEN RT", "HASSAN A;ELHOSENY M;KAYED M",
"JAYAPRIYA T;KELUSKAR V;SRIDHAR M;LOKESH KUMAR S;FERNANDES A",
"NUZZOLESE E;MALERBA G;VELLA GD", "BARRONE J;VIDAL MC;STEVENSON R",
"TIMME M;VIKTOROV J;STEFFENS L;STREETER A;KARCH A;SCHMELING A",
"ALWOHAIBI RN;ALMAIMONI RA;ALSHREFY AJ;ALMUSAILET LI;ALHAZZAA SA;MENEZES RG",
"EL-DESOUKY SS;KABBASH IA", "ANDERSEN IL;OCEPEK M;THINGNES SL;NEWBERRY RC",
"WANG C;TIAN ZK;WEN D;QU W;XU R;LIU Y;JIA H;TANG X;LI J;ZHA L;LIU Y",
"MORGAN J", "SALAZAR-VALENZUELA L;LÓPEZ-LÁZARO S;AGUAYO-CÁDIZ JE;CAPITANEANU C;FONSECA GM",
"MAHABALA KY;NATARAJAN S;MAHMOOD MT;NAYAK AP;RAO A", "BJELOPAVLOVIC M;REDER SR;FRITZEN I;BROCKMANN MA;HARDT J;PETROWSKI K",
"LORETO DBL;BARROS BÁCD", "WALIA M;BHATI K;GULLAIYA J", "ARROYO-BOTE S;MARTÍNEZ-ARROYO C;GALLEGO-ÁLVAREZ MÁ;ARROYO-BOTE C;MANZANARES-CÉSPEDES MC",
"SOORNEEDI N;VENKATACHALAIAH A;MANIKYA S;DASARI A;GADDAM B;LATHA A",
"PALMELA PEREIRA CM;DOS SANTOS AR;GONÇALVES CR;NUSHI V;COUTINHO F;SALVADO E SILVA FJ;DE SOUSA SANTOS RFV",
"JAWANDA MK;GUPTA S;SANDHU H;ESCOBEDO RLO;BHULLAR HS;HAMZA M",
"HARUDIN MH;FRANCO A;JAFFAR N;NOOR MHM;IBRAHIM MA;MANICA S",
"KUHNEN B;FERNANDES CMS;BARROS F;SCARSO FILHO J;GONÇALVES M;SERRA MC",
"VODANOVIĆ M;SUBAŠIĆ M;MILOŠEVIĆ DP;GALIĆ I;BRKIĆ H",
"RAMKUMAR J;GANESH R;VINAY J", "RIVERA-MENDOZA F;ESPINOZA-SILVA PV;FONSECA GM",
"PEREIRA CP;SANTOS R;NUSHI V;LAMEIRO MV;ANTUNES P;CARVALHO R;MAJOR T;ALQAHTANI SJ"
), Author.s..ID = c("57211806807; 57192983097; 36952129500",
"58235484500; 58236398600; 58235265200; 57217734842; 58235484600; 58236171600; 58235043100; 36183085400; 57207799926; 56878742000",
"58235287100; 57205450144; 57189010056", "57215812605; 58519454000; 57924026600; 57924660300; 57829018800; 58519185900",
"57195633934; 6506257821; 6508066300; 6602243600", "58195847100; 15135714000; 58195400400",
"57218218739; 6602870096; 57190741097", "58307513800; 58308177600; 58022245400; 57192995591; 35190337400; 35520201500",
"58484954800; 55185444800", "7401571955; 7402116738; 25651855700; 8552552500; 7101707095; 7201706201; 58303547300; 58569857500; 7003793212",
"57416602300; 6701772603; 57416602400; 58080360400; 58081085000; 36028646100",
"57203321439; 55357687600; 24833021600; 55163807800; 7401492628",
"56727508100; 57218226768; 35932019300; 57203955437; 56668190400; 58081759100; 58082278400; 57840021300; 57189326786; 36839218900",
"58544769500; 58544603300; 46861416800", "57061571400; 8042276500; 6701854111; 51565457300; 26532954500; 56764949400",
"57391780400; 57188842428; 57203335815; 57215374469; 57203317671; 58191839100; 58192302000; 35236150000; 56009909700; 55916628500",
"57194377552; 7003893659; 56539124800; 55318758500; 56441159900; 7004885391",
"55900416400; 57207514529; 57194834663; 6701764799; 57039166200; 58352179300; 6602247508; 57203616204; 58182056400; 56730646000; 58478877100",
"57428699300; 56797610300; 55957033800; 57429757800; 58141532500; 12761531400; 7404487919; 55731667900",
"56898875600; 57224073300; 26640785000", "57195629545", "57205495992; 57217653844; 7005882178; 15061137300",
"58136445000; 57210935896; 57189010056; 57205450144", "58540862700; 25522097000; 58540160600; 57423908600; 57195430892; 58540862800",
"57783544200; 7006925639; 55206855800; 16245238300", "57220773969; 57148260400; 14422375000",
"58074312500; 6504295343; 57204201957; 57268460000; 57901658300",
"56056565700; 6602086533; 57198019376", "58507483800; 56237340500; 7402601773",
"56072719400; 58132576500; 57297746300; 55147892100; 57223444197; 7003946207",
"57904670600; 57904055400; 57904270300; 57904670700; 57904471800; 55517099900",
"57194481538; 16835094000", "7101937964; 35750447700; 55330827100; 7005305192",
"57209341512; 58108459600; 57203920354; 57221468725; 57919847100; 57857535000; 57919671100; 57914942600; 56928003400; 54786023300; 55850166100",
"57700221200", "57539490700; 46861416800; 57540135000; 57195464981; 35190337400",
"57552704200; 57903377000; 58562326200; 57194032731; 57191113816",
"57222986450; 57338272500; 58451580100; 7003454067; 56236335200; 8904910000",
"57209570307; 58453037100", "57325475900; 57465984000; 57396827200",
"56203448200; 57855205700; 57855205800; 57855831000; 57205131975",
"57219023505; 58537713700; 57189588884; 58537055700; 57559779600; 58537055800",
"55433307900; 58186424700; 58070106900; 58185591700; 58185926500; 58186424800; 56979441000",
"55556592000; 57213338080; 58144251500; 58295221000; 58292275300; 57218176879",
"57192662543; 57205450144; 58157816800; 57193712434; 55358169200; 57189010056",
"57221841980; 50261506600; 57221849486; 6504233232; 7202320517; 54917611300",
"12446817300; 6508382225; 57211558771; 57202373100; 6701830546",
"58236897700; 58237381400; 56631932700", "57194795258; 57777793500; 35190337400",
"55433307900; 56979441000; 58185591700; 58145498900; 58145499000; 58145697100; 58145089200; 56450570300"
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
我想做的是确定 test2$AU 中有一个来自 1 test2$Author.s. 的不同名称。所以我制作了这个脚本:
library(tidyverse)
library(stringr)
au <- strsplit(test2$AU, ";")
author_ids <- strsplit(test2$Author.s..ID, ";")
max_elements <- max(lengths(au), lengths(author_ids))
au <- lapply(au, function(x) c(x, rep(NA, max_elements - length(x))))
author_ids <- lapply(author_ids, function(x) c(x, rep(NA, max_elements - length(x))))
author_data <- data.frame(
AU = unlist(au, use.names = FALSE),
Author.s..ID = unlist(author_ids, use.names = FALSE)
)
author_data <- na.omit(author_data)
author_data$Author.s..ID <- gsub(" ","", author_data$Author.s..ID)
author_data <- unique(author_data)
mismatches <- author_data %>%
group_by(Author.s..ID) %>%
summarize(AU_count = n_distinct(AU))
mismatches <- mismatches %>%
filter(AU_count > 1)
if (nrow(mismatches) > 0) {
print(mismatches)
} else {
cat("No mismatches or variations found.")
}
然后我做一个 for 循环来替换所有不匹配的东西:
for (i in 1:nrow(mismatches)) {
q <- mismatches[i, 1]
rows <- author_data[which(author_data$Author.s..ID == as.character(q)), ]
for (y in 1:nrow(rows)) {
name_replaced <- rows$AU[2:y]
name_fixed <- rows$AU[1]
for (j in 1:nrow(test2)) {
for (k in 1:length(name_replaced)) {
if (grepl(name_replaced[k], test2$AU[j])) {
test2$AU[j] <- gsub(name_replaced[k], name_fixed, test2$AU[j])
} else {test2$AU[j] <- test2$AU[j]
}
}
}
}
}
我制作的 for 循环创建了一个奇怪的字符串子,它重复了字符,如果我尝试运行循环,我仍然不匹配并且输出仍然不正确。
author_data[which(author_data$Author.s..ID == "56979441000"),]
我哪里做错了?
谢谢!
答:
1赞
jkd
9/24/2023
#1
如果你最终想做的是找到多次归因的 ID 和具有多个 ID 的作者,我建议你首先整理数据,每行有一个作者和作者 ID。然后,查找重复项就变得容易多了:
library(tidyverse)
# tidy up your data
test2 <- test2 |>
mutate(ManuscriptID = row_number(), # I assumed these are articles or manuscripts?
AU = map(AU, \(x) trimws(unlist(strsplit(x,";")))),
Author.s..ID = map(Author.s..ID, \(x) trimws(unlist(strsplit(x,";"))))) |>
unnest(c(AU, Author.s..ID))
# find Authors having multiple IDs
test2 |>
group_by(AU) |>
filter(n()>1) |>
arrange(Author.s..ID)
# deal with Authors having multiple IDs
# ... insert your own code here
# find IDs attributed multiple times
test2 |>
group_by(Author.s..ID) |>
filter(n()>1) |>
arrange(Author.s..ID)
# deal with IDs attributed multiple times
# ... insert your own code here
# restore the previous data structure
test2 |>
group_by(ManuscriptID) |>
summarise(across(everything(), ~ paste(.x, collapse="; ")))
在这两种情况下,我都由您决定如何更改重复的 ID。
1赞
SamR
9/24/2023
#2
你不需要在这里使用。我同意另一个答案,即您想将数据以长格式显示。我就是这样做的:gsub()
author_id <- data.frame(
AU = unlist(strsplit(test2$AU, ";")),
ID = trimws(unlist(strsplit(test2$Author.s..ID, ";"))),
grp = cumsum(sequence(lengths(strsplit(test2$AU, ";"))) == 1)
)
该列稍后将用于将数据放回宽格式。grp
您的问题似乎是将作者姓名的拼写略有不同,并替换为该名称的第一个实例。例如:
filter(author_id, ID == "57189010056")
# AU ID grp
# 1 MÂNICA S 57189010056 3
# 2 MÂNICA S 57189010056 23
# 3 MANICA S 57189010056 45
当您使用 时,您可以非常简单地做到这一点:tidyverse
# Replace the author names with the first instance
author_id <- author_id |>
group_by(ID) |>
mutate(AU = AU[1])
这解决了问题,例如,使用前面的示例:
filter(author_id, ID == "57189010056")
# AU ID grp
# <chr> <chr> <int>
# 1 MÂNICA S 57189010056 3
# 2 MÂNICA S 57189010056 23
# 3 MÂNICA S 57189010056 45
现在我们可以将数据放回宽格式:
# Put the data back in wide form
ids_wide <- author_id |>
split(~grp) |>
lapply(\(df) data.frame(
AU = paste(df$AU, collapse = ";"),
`Author.s..ID` = paste(df$ID, collapse = "; ")
)) |>
bind_rows()
输出:
head(ids_wide)[-2,] # Line 2 is long and wraps terminal
# AU Author.s..ID
# 1 MEISYARA D;GUSWENRIVO I;SINGHAM GV 57211806807; 57192983097; 36952129500
# 3 SEBASTIAN S;FRANCO A;MÂNICA S 58235287100; 57205450144; 57189010056
# 4 HUO D-M;MAO X-Y;MO W-W;ZHAO F-M;DU M;SUN R-R 57215812605; 58519454000; 57924026600; 57924660300; 57829018800; 58519185900
# 5 KIHARA EN;KARANJA SM;WANZALA P;WAGAIYU EG 57195633934; 6506257821; 6508066300; 6602243600
# 6 LIANG CT;HIGGINS D;ASHAR A 58195847100; 15135714000; 58195400400
评论
1赞
Rizky Merdietio
9/24/2023
谢谢!这很简单,但它解决了所有问题。我需要回去学习基础知识..
评论