2022-11-27

Double Loops in R: Use .name_repair to specify repair?

I have this dataset in R:

set.seed(123)

# Generate `n` random identifier strings, each made of five uppercase
# letters, a zero-padded four-digit number, and one trailing uppercase letter.
#
# n: number of strings to generate (default 5000).
# Returns a character vector of length `n`.
myFun <- function(n = 5000) {
  # Five independent draws of n letters, pasted together position-wise.
  letter_draws <- lapply(seq_len(5), function(k) sample(LETTERS, n, TRUE))
  prefix <- do.call(paste0, letter_draws)

  # Draw the numeric part before the trailing letter so the sequence of
  # random draws stays in the same order.
  digits <- sprintf("%04d", sample(9999, n, TRUE))
  suffix <- sample(LETTERS, n, TRUE)

  paste0(prefix, digits, suffix)
}

# Four columns of 100 random identifier strings each.
col1 <- myFun(100)
col2 <- myFun(100)
col3 <- myFun(100)
col4 <- myFun(100)

# Assign each of the 100 rows to one of four groups at random.
group <- sample(c("A", "B", "C", "D"), 100, replace = TRUE)

# Example data set: one row per record, four string columns plus the group.
example <- data.frame(col1, col2, col3, col4, group)

       col1       col2       col3       col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E     A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H     C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O     A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N     B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B     D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D     C

I wrote this loop that compares different string distance metrics between all combinations of (col1,col2) and (col3,col4):

# Compare every (col1, col2) pair and every (col3, col4) pair under each
# string-distance method, average the distances across methods, and
# standardise the two averages.
library(stringdist)
library(dplyr)  # provides %>% and as_tibble()
library(tidyr)  # provides pivot_longer()

# String-distance methods passed to stringdistmatrix().
method <- c(
  "osa", "lv", "dl", "hamming", "lcs",
  "qgram", "cosine", "jaccard", "jw", "soundex"
)

# One slot per method; each slot holds a two-column data frame of distances
# (one column for col1 vs col2, one for col3 vs col4).
results <- vector("list", length(method))

# Strings belonging to each row of the long-format results. They are taken
# from the reshaped distance matrix itself so that the labels always line up
# with the distances (recycling the 100-element col1..col4 vectors against
# the 100 x 100 pair rows would mislabel them).
pair_labels <- NULL

for (i in seq_along(method)) {
  method_i <- method[i]
  name_1_i <- paste0("col1_col_2", method_i)
  name_2_i <- paste0("col3_col_4", method_i)

  # Full distance matrix -> long format: one row per (a, b) pair.
  p1_i <- stringdistmatrix(col1, col2, method = method_i, useNames = "string") %>%
    as_tibble(rownames = "a") %>%
    pivot_longer(-1, names_to = "b", values_to = name_1_i)

  p2_i <- stringdistmatrix(col3, col4, method = method_i, useNames = "string") %>%
    as_tibble(rownames = "a") %>%
    pivot_longer(-1, names_to = "b", values_to = name_2_i)

  # The pair layout is the same for every method, so record it only once.
  if (is.null(pair_labels)) {
    pair_labels <- data.frame(
      col1 = p1_i$a,
      col2 = p1_i$b,
      col3 = p2_i$a,
      col4 = p2_i$b
    )
  }

  # Keep only the distance column of each long table.
  results[[i]] <- cbind(p1_i[, 3], p2_i[, 3])
}

# Side-by-side distance columns for all methods.
final <- do.call(cbind.data.frame, results)

# Mean distance across all methods, derived from `method` so the list of
# columns cannot drift out of sync with the loop above.
average_col1_col2_dist <- rowMeans(final[paste0("col1_col_2", method)])
average_col3_col4_dist <- rowMeans(final[paste0("col3_col_4", method)])

final <- data.frame(
  pair_labels,
  average_col1_col2_dist,
  average_col3_col4_dist
)

# scale() only accepts numeric input, so standardise just the two averages
# and leave the character label columns untouched.
numeric_cols <- c("average_col1_col2_dist", "average_col3_col4_dist")
final[numeric_cols] <- lapply(
  final[numeric_cols],
  function(x) as.numeric(scale(x))
)

Now I would like to turn this into a "double loop" that performs the same comparisons, but only within each "group":

# Attempted per-group version of the single loop above: for every distance
# method and every group, compare (col1, col2) and (col3, col4) within the
# group only. The code is kept exactly as posted; NOTE(review) comments mark
# the problems visible in it.
results = list()


for (i in 1:length(method))
# NOTE(review): the next line has one more "(" than ")", so the script does
# not parse as posted.
for (j in 1:length(unique(example$group))

{

{

# NOTE(review): example$group[j] is the group of row j, not the j-th distinct
# group; presumably unique(example$group)[j] was intended.
groups_j = unique(example$group[j])
# NOTE(review): `file` and its column `fsa` are not created anywhere in the
# code shown; the data frame built above is `example`, with column `group`.
# Presumably this should subset `example` by `group` -- confirm.
my_data_i = file[which(file$fsa == groups_j  ), ]


method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
 name_2_i = paste0("col3_col_4", method_i)

# Distance matrix for the group's rows, reshaped to one row per (a, b) pair.
# NOTE(review): as_tibble() takes its column names from the matrix's column
# names (the second string vector, via useNames = "string"); the reported
# "Column 1 must be named" error presumably means one of those names is
# empty for this subset -- confirm against the real data.
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method =  method_i, useNames = "string") %>%
            as_tibble(rownames = "a") %>%
            pivot_longer(-1, names_to = "b", values_to = name_1_i)

p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method =  method_i, useNames = "string") %>%
            as_tibble(rownames = "a") %>%
            pivot_longer(-1, names_to = "b", values_to = name_2_i)

# Keep only the distance column of each long table.
p1_i = p1_i[,3]
p2_i = p2_i[,3]

final_i = cbind(p1_i, p2_i)
# NOTE(review): the slot is indexed by method `i` only, so each group `j`
# overwrites the previous group's result for the same method.
 results[[i]] = final_i

}
   
}

final = do.call(cbind.data.frame, results)
# NOTE(review): col1..col4 here are the full vectors defined earlier, not the
# per-group subsets used inside the loop, so they need not line up with the
# rows of `final`.
final = cbind(col1,col2, col3,col4, final)

# Mean of the ten per-method distance columns for each column pair.
average_col1_col2_dist = (final$col1_col_2osa  + final$col1_col_2lv + final$col1_col_2dl      + final$col1_col_2hamming + final$col1_col_2lcs +     final$col1_col_2qgram  + final$col1_col_2cosine    + final$col1_col_2jaccard + final$col1_col_2jw   + final$col1_col_2soundex)/10

 average_col3_col4_dist =  ( final$col3_col_4osa     +    final$col3_col_4lv       +     final$col3_col_4dl  +     final$col3_col_4hamming +  final$col3_col_4lcs +  final$col3_col_4qgram  +   final$col3_col_4cosine +    final$col3_col_4jaccard  +    final$col3_col_4jw     +   final$col3_col_4soundex)/10

final = data.frame( col1, col2, col3, col4, average_col1_col2_dist,  average_col3_col4_dist)
# NOTE(review): scale() expects numeric input; the character columns
# col1..col4 in `final` likely make this call fail -- confirm.
final = scale(final)

But I keep getting this error:

Error:
! Column 1 must be named.
Use .name_repair to specify repair.
Caused by error in `repaired_names()`:
! Names can't be empty.
x Empty name found at location 1.

Does anyone know how I can fix this?

Thank you!



No comments:

Post a Comment