Thanks for the package.
It offers considerable time benefits.
Below is an example having updated the nysiis_original function.
# install.packages("babynames")
# install.packages("phonics")
library("babynames")
library("phonics")
name <- babynames$name
length(name)
# 1858689
system.time(a <- nysiis_original_perl(name))
# user system elapsed
# 13.36 0.14 13.54
system.time(b <- nysiis(name))
# user system elapsed
# 22.75 0.24 23.02
# All equal?
all.equal(a, b)
# [1] TRUE
# microbenchmark'ing
microbenchmark(
nysiis_original_perl(name),
nysiis(name), times = 25
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# nysiis_original_perl(name) 308.5931 311.0220 316.0347 312.2456 315.8408 345.8459 25
# nysiis(name) 568.2662 573.1073 577.4318 575.4571 577.5975 606.7362 25
sessionInfo()
# R version 3.5.0 (2018-04-23)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 17763)
#
# Matrix products: default
#
# locale:
# [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252 LC_MONETARY=English_Australia.1252 LC_NUMERIC=C LC_TIME=English_Australia.1252
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] phonics_1.1.0 babynames_0.3.0
#
# loaded via a namespace (and not attached):
# [1] compiler_3.5.0 tools_3.5.0 pillar_1.3.1 tibble_1.4.2 Rcpp_1.0.0 crayon_1.3.4 rlang_0.3.0.1
# nysiis_original with perl = TRUE ...
nysiis_original_perl <- function(word, maxCodeLen = 6) {
## First, remove any nonalphabetical characters and capitalize it
word <- gsub("[^[:alpha:]]*", "", word, perl = TRUE)
word <- toupper(word)
## Translate first characters of name: MAC to MCC, KN to N, K to C, PH,
## PF to FF, SCH to SSS
word <- gsub("^MAC", "MCC", word, perl = TRUE)
word <- gsub("KN", "NN", word, perl = TRUE)
word <- gsub("K", "C", word, perl = TRUE)
word <- gsub("^PF", "FF", word, perl = TRUE)
word <- gsub("PH", "FF", word, perl = TRUE)
word <- gsub("SCH", "SSS", word, perl = TRUE)
## Translate last characters of name: EE to Y, IE to Y, DT, RT, RD,
## NT, ND to D
word <- gsub("EE$", "Y", word, perl = TRUE)
word <- gsub("IE$", "Y", word, perl = TRUE)
word <- gsub("DT$", "D", word, perl = TRUE)
word <- gsub("RT$", "D", word, perl = TRUE)
word <- gsub("RD$", "D", word, perl = TRUE)
word <- gsub("NT$", "D", word, perl = TRUE)
word <- gsub("ND$", "D", word, perl = TRUE)
## First character of key = first character of name.
first <- substr(word, 1, 1)
word <- substr(word, 2, nchar(word))
## EV to AF else A, E, I, O, U to A
word <- gsub("EV", "AF", word, perl = TRUE)
word <- gsub("E|I|O|U", "A", word, perl = TRUE)
## Q to G, Z to S, M to N
word <- gsub("Q", "G", word, perl = TRUE)
word <- gsub("Z", "S", word, perl = TRUE)
word <- gsub("M", "N", word, perl = TRUE)
## KN to N else K to C
## SCH to SSS, PH to FF
## Rules are implemented as part of opening block
## H to If previous or next is non-vowel, previous.
word <- gsub("([^AEIOU])H", "\\1", word, perl = TRUE)
word <- gsub("(.)H[^AEIOU]", "\\1", word, perl = TRUE)
## W to If previous is vowel, A
word <- gsub("([AEIOU])W", "A", word, perl = TRUE)
## If last character is S, remove it
word <- gsub("S$", "", word, perl = TRUE)
## If last characters are AY, replace with Y
word <- gsub("AY$", "Y", word, perl = TRUE)
## Remove duplicate consecutive characters
word <- gsub("([A-Z])\\1+", "\\1", word, perl = TRUE)
## If last character is A, remove it
word <- gsub("A$", "", word, perl = TRUE)
## Append word except for first character to first
word <- paste(first, word, sep = "")
## Truncate to requested length
word <- substr(word, 1, maxCodeLen)
return(word)
}