提问人:stats_noob 提问时间:11/24/2022 更新时间:11/27/2022 访问量:196
R 中的双循环:使用 .name_repair 指定修复?
Double Loops in R: Use .name_repair to specify repair?
问:
我在 R 中有这个数据集:
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1 col2 col3 col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D C
我编写了这个循环,比较了 (col1,col2) 和 (col3,col4) 的所有组合之间的不同字符串距离度量:
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
library(stringdist)
results = list()
for (i in 1:length(method))
{
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(col1, col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(col3, col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
现在,我想把它变成一个“双循环”,并进行相同的比较,但比较应该只在每个“组”内进行:
results = list()
for (i in 1:length(method))
for (j in 1:length(unique(example$group))
{
{
groups_j = unique(example$group[j])
my_data_i = file[which(file$fsa == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
但是我不断收到此错误:
Error:
! Column 1 must be named.
Use .name_repair to specify repair.
Caused by error in `repaired_names()`:
! Names can't be empty.
x Empty name found at location 1.
有谁知道我该如何解决这个问题?
谢谢!
答:
在试图理解你在做什么的过程中,我偏离了你的原始代码。其中大部分并不一定有什么问题!
您的代码
至于您的分组代码...
你从
for(this in that)
for(this in that)
{
{
括号嵌套了 for 语句中的内容。你需要
for(this in that) {
for(this in that) {
# or this works
for(this in that)
{
for(this in that)
{
当您指定条件时,您选择了整数。但是,您可以直接使用字符串,如for
for(i in method) { # i is a string
# versus
for(i in 1:length(method)) { # i is an integer
当你写嵌套语句时,你错过了一个右括号。for
for(j in 1:length(unique(example$group)) # end parentheses missing!
# should have been
for(j in 1:length(unique(example$group)))
# easier to see like this:
for(j in 1:length(
unique(
example$group
)
)
)
您知道吗?可以将 RStudio 设置为使用“彩虹括号”,这非常适合确保不会错过右括号或括号。转到“工具”->“全局选项”->代码(弹出窗口中的左侧菜单)->显示(菜单弹出窗口中的顶部菜单)&“彩虹括号”是列表中的最后一项。这是我当前外观设置的样子:
提取组时,您选择的是数据集行,而不是唯一值。
# this selects jth row, then looks for unique values
groups_j = unique(example$group[j])
# you need to get the unique values, then iterate
group_j = unique(example$group)[j]
# the j goes outside the call for unique()
在此代码中,您编写了 和 .我假设这等价于 和 ,因为我没有 中的任何内容。file
file$fsa
example
example$group
file
所有这些代码行都做同样的事情。请记住,它位于数据框中,但它也是环境中的一个对象。group
my_data_i = example[which(example$group == group_j), ] # this would work
my_data_i <- filter(example, groups == group_j) # this would work
my_data_i <- example[group == group_j, ] # this would work
my_data_i <- example[example$group == group_j, ] # this would work
如果迭代了组而不是索引,则可以跳过创建,这是您唯一使用的 。group_j
j
for(j in unique(example$group)) {
my_data_i <- example[example$group == j, ]
}
当您单独使用 和 发送结果时,每次迭代都会覆盖数据。i
i
j
第一个组迭代可以进入,但下一个组迭代可以绑定到该数据,也可以放在列表中的列表中。results[[i]]
group
例如:
results[[1]] <- group 1, method 1
results[[1]] <- rbind(results[[1]], [group 2, method 1])
# or
results[[1]][[1]] <- group 1, method 1
results[[1]][[2]] <- group 2, method 1
考虑到列表(上面)的这两个选项,第一个选项将允许您的剩余代码(创建最终代码、平均值等)在不进行任何更改的情况下工作。但是,如果使用第二个选项(如上所述),则需要修改该代码。
如果你离开这个会起作用:for(j
1:length
if(j < 2) {
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
如果您使用 ,则可以使用以下命令:for(j in unique(example$group))
if(isTRUE(j == unique(example$group)[1])) { # isTRUE() to avoid null errors
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
嵌套语句全部位于一个块中。for
results = list()
for (i in 1:length(method)) { # bracket missing here; it was in the wrong place
for (j in 1:length(unique(example$group))) { # missing a parentheses here
# { # this needs to be after each for statement
# groups_j = unique(example$group[j]) # you have selected the jth row, not the jth unique
group_j = unique(example$group)[j] # the selection goes outside the call for unique()
# use things like print or message to check what your function does
# print(group_j)
# message('this is a message ', group_j) # notice the different color in the console?
my_data_i <- example[group == group_j, ] # this would work
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
# results[[i]] = final_i # you replace this content everytime you change groups
# you need to append the values between groups (assuming you want one column per test type)
# first append, then combine
if(j < 2) { # use < instead of == to avoid null error
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
}
}
我的代码来完成相同的任务
我添加了一些检查和平衡,使其更具动态性。您可以将任意数量的列、方法或组发送到 。grpComp
这使用 、 和 。tidyverse
glue
stringdist
第一个函数由另一个函数调用。
library(tidyverse)
library(stringdist)
library(glue)
strD <- function(c1, c2, mm) { # input column 1; column 2; measurement method
res <- stringdistmatrix(c1, c2, method = mm, useNames = 'string')
f_res <- matrix(res) # extract values and flatten
}
这是按组函数划分的距离。
grpComp <- function(fr, methods, grp) { # data frame of columns to compare,
# methods to use, groups (vector same length as df rows)
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
if(length(grp) != nrow(fr)) {
message('there groups vector length must match number of rows in the data')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc( # by column sets
seq(from = 1, to = length(cnames), by = 2),
function(i) {
str_gr <- map_dfr( # by group
unique(grp),
function(k) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[grp == k, cnames[i]],
fr[grp == k, cnames[i + 1]], j)),
optional = F, row.names = NULL,
col.names = paste0("c", i, i+1, '_', j))
}) # combine groups by rows
str_gr
}) # combine methods by columns
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(m in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{m}" := rowMeans(across(contains(m))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
这就是使用此代码的方式。
test5 <- grpComp(example[, 1:4], methods, example$group)
即使您的非分组数据函数正在工作,我想我也会包含该代码。
strComp <- function(fr, methods) { # data frame of columns to compare, methods to use
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc(
seq(from = 1, to = length(cnames), by = 2),
function(i) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[, cnames[i]], fr[, cnames[i + 1]], j)), optional = F,
col.names = paste0("c", i, i+1, '_', j))
})
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(k in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{k}" := rowMeans(across(contains(k))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
要使用此功能,请执行以下操作:
test4 <- strComp(example[, 1:4], methods)
评论
group_map
example %>% group_by(group) %>% group_map(myInnerLoopFun)
for