8000 stringmagic/R/string_tools.R at master · cran/stringmagic · GitHub
[go: up one dir, main page]

Skip to content

Latest commit

 

History

History
2439 lines (2083 loc) · 86.1 KB

File metadata and controls

2439 lines (2083 loc) · 86.1 KB
#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Mon Aug 8 16:26:42 2022
# ~: string tools
#----------------------------------------------#
#' Chains basic operations to character vectors
#'
#' Simple tool to perform multiple operations to character vectors.
#'
#' @inheritParams string_clean
#'
#' @param x A character vector. If not a character vector but atomistic (i.e. not a list),
#' it will be converted to a character vector.
#' @param ... Character **scalars**. Character scalar containing the comma separated values
#' of operations to perform to the vector. The 50+ operations are detailed in the help
#' page of [string_magic()].
#' @param op Character **vector** or `NULL` (default). Character scalar containing the comma separated values
#' of operations to perform to the vector. The 50+ operations are detailed in the help
#' page of [string_magic()]. Note that if this argument is provided, then the values in
#' `...` are ignored.
#' @param pre_unik Logical scalar, default is `NULL`. Whether to first unique the vector
#' before applying the possibly costly string operations, and merging back the result.
#' For very large vectors with repeated values the time gained can be substantial. By
#' default, this is `TRUE` for vector of length 1M or more.
#'
#' @details
#' This function is a simple wrapper around string_magic. Formally, `string_ops(x, "op1, op2")`
#' is equivalent to `string_magic("{op1, op2 ? x}")`.
#'
#' @return
#' In general it returns a character vector. It may be of a length different from the original
#' one, depending on the operations performed.
#'
#' @author
#' Laurent R. Berge
#'
#' @inherit string_clean seealso
#'
#' @family tools with aliases
#'
#' @examples
#'
#' # data on car models
#' cars = row.names(mtcars)
#'
#' # let's get the brands starting with an "m"
#' string_ops(cars, "'i/^m'get, x, unik")
#'
#' # Explainer:
#' # 'i/^m'get: keeps only the elements starting with an m,
#' # i/ is the 'regex-flag' "ignore" to ignore the case
#' # ^m means "starts with an m" in regex language
#' # x: extracts the first pattern. The default pattern is "[[:alnum:]]+"
#' # which means an alpha-numeric word
#' # unik: applies unique() to the vector
#' # => see help in ?string_magic for more details on the operations
#'
#'
#' # let's get the 3 largest numbers appearing in the car models
#' string_ops(cars, "'\\d+'x, rm, unik, num, dsort, 3 first")
#'
#' # Explainer:
#' # '\\d+'x: extracts the first pattern, the pattern meaning "a succession"
#' # of digits in regex language
#' # rm: removes elements equal to the empty string (default behavior)
#' # unik: applies unique() to the vector
#' # num: converts to numeric
#' # dsort: sorts in decreasing order
#' # 3 first: keeps only the first three elements
#'
#' # You can use several character vectors as operations:
#' string_ops(cars,
#' "'\\d+'x, rm, unik",
#' "num, dsort, 3 first")
#'
string_ops = function(x, ..., op = NULL, pre_unik = NULL, namespace = NULL, envir = parent.frame()){
if(missing(x)){
stop("Argument `x` must be provided. PROBLEM: it is currently missing.")
}
if(!is.atomic(x)){
stop("Argument `x` must be atomic. Currently it is of class `", class(x)[1], "`")
}
if(length(x) == 0){
return(x)
}
if(!true_character(x)){
x = as.character(x)
}
set_pblm_hook()
check_character(op, null = TRUE, no_na = TRUE)
if(!is.null(op)){
all_ops = op
} else {
all_ops = check_set_dots(..., mbt = TRUE, scalar = TRUE, character = TRUE)
all_ops = unlist(all_ops)
}
check_logical(pre_unik, null = TRUE, scalar = TRUE)
# For very large vectors, we unique
n = length(x)
if(is.null(pre_unik)){
pre_unik = n > 1e6
}
if(pre_unik){
x_int = to_index(x)
x_small = x[!duplicated(x_int)]
res_small = string_ops(x_small, op = all_ops, pre_unik = FALSE,
namespace = namespace, envir = envir)
res = res_small[x_int]
} else {
if(is.null(namespace)){
namespace = "R_GlobalEnv"
}
user_ops_all = getOption("string_magic_user_ops")
# beware the sneaky assignment!
if(!is.null(user_ops_all) && !is.null(user_info <- user_ops_all[[namespace]])){
.user_funs = user_info$funs
.valid_operators = user_info$operators
} else {
.user_funs = NULL
.valid_operators = getOption("string_magic_operations_default")
}
res = x
for(op in all_ops){
group_flag = 1 * grepl("~", op, fixed = TRUE)
res = apply_simple_operations(res, "string_ops", op, .check = TRUE, .envir = envir,
.data = list(), group_flag = group_flag,
.delim = c("{", "}"), .user_funs = .user_funs,
.valid_operators = .valid_operators)
}
}
if("group_index" %in% names(attributes(res))){
attr(res, "group_index") = NULL
}
res
}
#' Detects whether a pattern is in a character string
#'
#' Function that detects if one or more patterns are in a string. The patterns can be
#' chained, by default this is a regex search but special flags be triggered with a
#' specific syntax, supports negation.
#'
#' @param x A character vector.
#' @param ... Character scalars representing the patterns to be found. By default they are (perl) regular-expressions.
#' Use ' & ' or ' | ' to chain patterns and combine their result logically (ex: `'[[:alpha:]] & \\d'` gets strings
#' containing both letters and numbers). You can negate by adding a `!` first (ex: `"!sepal$"` will
#' return `TRUE` for strings that do not end with `"sepal"`).
#' Add flags with the syntax 'flag1, flag2/pattern'. Available flags are: 'fixed', 'ignore', 'word' and 'magic'.
#' Ex: "ignore/sepal" would get "Sepal.Length" (wouldn't be the case w/t 'ignore').
#' Shortcut: use the first letters of the flags. Ex: "if/dt[" would get `"DT[i = 5]"` (flags 'ignore' + 'fixed').
#' For 'word', it adds word boundaries to the pattern. The `magic` flag first interpolates
#' values directly into the pattern with "{}".
#' @param or Logical, default is `FALSE`. In the presence of two or more patterns,
#' whether to combine them with a logical "or" (the default is to combine them with a logical "and").
#' @param pattern (If provided, elements of `...` are ignored.) A character vector representing the
#' patterns to be found. By default a (perl) regular-expression search is triggered.
#' Use ' & ' or ' | ' to chain patterns and combine their result logically (ex: `'[[:alpha:]] & \\d'` gets strings
#' containing both letters and numbers). You can negate by adding a `!` first (ex: `"!sepal$"` will
#' return `TRUE` for strings that do not end with `"sepal"`).
#' Add flags with the syntax 'flag1, flag2/pattern'. Available flags are: 'fixed', 'ignore', 'word' and 'magic'.
#' Ex: "ignore/sepal" would get "Sepal.Length" (wouldn't be the case w/t 'ignore').
#' Shortcut: use the first letters of the flags. Ex: "if/dt[" would get `"DT[i = 5]"` (flags 'ignore' + 'fixed').
#' For 'word', it adds word boundaries to the pattern. The `magic` flag first interpolates
#' values directly into the pattern with "{}".
#' @param fixed Logical scalar, default is `FALSE`. Whether to trigger a fixed search instead of a
#' regular expression search (default).
#' @param word Logical scalar, default is `FALSE`. If `TRUE` then a) word boundaries are added to the pattern,
#' and b) patterns can be chained by separating them with a comma, they are combined with an OR logical operation.
#' Example: if `word = TRUE`, then pattern = "The, mountain" will select strings containing either the word
#' 'The' or the word 'mountain'.
#' @param ignore.case Logical scalar, default is `FALSE`. If `TRUE`, then case insensitive search is triggered.
#' @param last A function or `NULL` (default). If a function, it will be applied to the vector
#' just before returning it.
#' @param envir Environment in which to evaluate the interpolations if the flag `"magic"` is provided.
#' Default is `parent.frame()`.
#'
#' @details
#' The internal function used to find the patterns is [base::grepl()] with `perl = TRUE`.
#'
#' @section Generic regular expression flags:
#'
#' All `stringmagic` functions support generic flags in regular-expression patterns.
#' The flags are useful to quickly give extra instructions, similarly to *usual*
#' [regular expression flags](https://javascript.info/regexp-introduction).
#'
#' Here the syntax is "flag1, flag2/pattern". That is: flags are a comma separated list of flag-names
#' separated from the pattern with a slash (`/`). Example: `string_which(c("hello...", "world"), "fixed/.")` returns `1`.
#' Here the flag "fixed" removes the regular expression meaning of "." which would have otherwise meant *"any character"*.
#' The no-flag verion `string_which(c("hello...", "world"), ".")` returns `1:2`.
#'
#' Alternatively, and this is recommended, you can collate the initials of the flags instead of using a
#' comma separated list. For example: "if/dt[" will apply the flags "ignore" and "fixed" to the pattern "dt[".
#'
#' The four flags always available are: "ignore", "fixed", "word" and "magic".
#'
#' + "ignore" instructs to ignore the case. Technically, it adds the perl-flag "(?i)"
#' at the beginning of the pattern.
#'
#' + "fixed" removes the regular expression interpretation, so that the characters ".", "$", "^", "["
#' (among others) lose their special meaning and are treated for what they are: simple characters.
#'
#' + "word" adds word boundaries (`"\\b"` in regex language) to the pattern. Further, the comma (`","`)
#' becomes a word separator. Technically, "word/one, two" is treated as "\\b(one|two)\\b". Example:
#' `string_clean("Am I ambushed?", "wi/am")` leads to " I ambushed?" thanks to the flags "ignore" and "word".
#'
#' + "magic" allows to interpolate variables inside the pattern before regex interpretation.
#' For example if `letters = "aiou"` then `string_clean("My great goose!", "magic/[{letters}] => e")`
#' leads to `"My greet geese!"`
#'
#' @return
#' It returns a logical vector of the same length as `x`.
#'
#' The function `string_which` returns a numeric vector.
#'
#' @author
#' Laurent R. Berge
#'
#' @inherit string_clean seealso
#'
#' @examples
#'
#' # NOTA: using `string_get` instead of `string_is` may lead to a faster understanding
#' # of the examples
#'
#' x = string_vec("One, two, one... two, microphone, check")
#'
#' # default is regular expression search
#' # => 3 character items
#' string_is(x, "^...$")
#'
#' # to trigger fixed search use the flag 'fixed'
#' string_is(x, "fixed/...")
#' # you can just use the first letter
#' string_is(x, "f/...")
#'
#' # to negate, use '!' as the first element of the pattern
#' string_is(x, "f/!...")
#'
#' # you can combine several patterns with " & " or " | "
#' string_is(x, "one & c")
#' string_is(x, "one | c")
#'
#' #
#' # word: adds word boundaries
#' #
#'
#' # compare
#' string_is(x, "one")
#' # with
#' string_is(x, "w/one")
#'
#' # words can be chained with commas (it is like an OR logical operation)
#' string_is(x, "w/one, two")
#' # compare with
#' string_is(x, "w/one & two")
#' # remember that you can still negate
#' string_is(x, "w/one & !two")
#'
#' # you can combine the flags
#' # compare
#' string_is(x, "w/one")
#' # with
#' string_is(x, "wi/one")
#'
#' #
#' # the `magic` flag
#' #
#'
#' p = "one"
#' string_is(x, "m/{p}")
#' # Explanation:
#' # - "p" is interpolated into "one"
#' # - we get the equivalent: string_is(x, "one")
#'
#'
#' #
#' # string_which
#' #
#'
#' # it works exactly the same way as string_is
#' # Which are the items containing an 'e' and an 'o'?
#' string_which(x, "e", "o")
#' # equivalently
#' string_which(x, "e & o")
#'
#'
string_is = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,
or = FALSE, pattern = NULL, envir = parent.frame(), last = NULL){
x = check_set_character(x, mbt = TRUE, l0 = TRUE)
if(length(x) == 0){
return(logical(0))
}
check_logical(ignore.case, scalar = TRUE)
check_logical(fixed, scalar = TRUE)
check_logical(word, scalar = TRUE)
check_logical(or, scalar = TRUE)
check_character(pattern, null = TRUE, no_na = TRUE)
check_function(last, null = TRUE)
if(missnull(pattern)){
dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)
if(!is.null(names(dots))){
if(is.null(last)){
warn_no_named_dots(dots)
} else {
# we remove the arguments that go to "last"
dot_names = names(dots)
args_last = names(formals(args(last)))
if(!is.null(args_last)){
args_ok = intersect(dot_names, args_last)
dots[args_ok] = NULL
if(sum(nchar(dot_names) > 0) != length(args_ok)){
warn_no_named_dots(dots, extra_args = args_last, extra_funName = "last")
}
}
}
}
pattern = unlist(dots)
}
or_origin = or
negate = FALSE
logical_op = function(a, b, or) if(or) a | b else a & b
res = NULL
for(i in seq_along(pattern)){
pat = pattern[i]
first_char = substr(pat, 1, 1)
pat_parsed = parse_regex_pattern(pat, c("fixed", "word", "ignore", "magic"),
envir = envir)
is_or = pat_parsed$is_or
flags = pat_parsed$flags
all_patterns = pat_parsed$patterns
negate_all = pat_parsed$is_not
is_fixed_origin = fixed || "fixed" %in% flags
is_word = word || "word" %in% flags
is_ignore = ignore.case || "ignore" %in% flags
res_current = NULL
n_pat = length(all_patterns)
for(j in 1:n_pat){
is_fixed = is_fixed_origin
p = all_patterns[j]
sub_negate = negate_all[j]
p = format_pattern(p, fixed = is_fixed, word = is_word, ignore = is_ignore)
is_fixed = attr(p, "fixed")
res_tmp = tryCatch(grepl(p, x, perl = !is_fixed, fixed = is_fixed),
error = function(e) structure(conditionMessage(e), class = "try-error"),
warning = function(w) structure(conditionMessage(w), class = "try-warning"))
is_warn = inherits(res_tmp, "try-warning")
warn_msg = ""
if(is_warn){
# is there an error?
warn_msg = res_tmp
res_tmp = tryCatch(suppressWarnings(grepl(p, x, perl = !is_fixed, fixed = is_fixed)),
error = function(e) structure(conditionMessage(e), class = "try-error"))
}
if(inherits(res_tmp, "try-error")){
pat_raw = pattern[i]
stopi("CONTEXT: {&p != pat_raw;evaluation of {Q?pat_raw}\n }",
"pattern = {Q?p} ",
"\nEXPECTATION: the pattern must be a valid regular expression",
"\nPROBLEM: `grepl` led to an error, see below:",
"\n{res_tmp}",
"{&nchar(warn_msg) > 0;\n{.}}")
} else if(is_warn){
pat_raw = pattern[i]
warni("CONTEXT: {&p != pat_raw;evaluation of {Q?pat_raw}\n }",
"pattern = {Q?p} ",
"\nA warning was raised when evaluating the pattern:",
"\n{warn_msg}", immediate. = TRUE)
}
if(sub_negate){
res_tmp = !res_tmp
}
if(is.null(res_current)){
res_current = res_tmp
} else {
res_current = logical_op(res_current, res_tmp, is_or[j])
}
}
if(is.null(res)){
res = res_current
} else {
res = logical_op(res, res_current, or)
}
}
if(!is.null(last)){
res = check_set_eval_fun(last, res, ...)
}
res
}
#' @describeIn string_is Detects if at least one element of a vector matches a regex pattern
string_any = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,
or = FALSE, pattern = NULL, envir = parent.frame()){
check_character(pattern, null = TRUE, no_na = TRUE)
if(missnull(pattern)){
dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)
warn_no_named_dots(dots)
pattern = unlist(dots)
}
any(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,
or = or, pattern = pattern, envir = envir))
}
#' @describeIn string_is Detects if all elements of a vector match a regex pattern
string_all = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,
or = FALSE, pattern = NULL, envir = parent.frame()){
check_character(pattern, null = TRUE, no_na = TRUE)
if(missnull(pattern)){
dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)
warn_no_named_dots(dots)
pattern = unlist(dots)
}
all(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,
or = or, pattern = pattern, envir = envir))
}
#' @describeIn string_is Returns the indexes of the values in which a pattern is detected
string_which = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,
or = FALSE, pattern = NULL, envir = parent.frame()){
check_character(pattern, null = TRUE, no_na = TRUE)
if(missnull(pattern)){
dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)
warn_no_named_dots(dots)
pattern = unlist(dots)
}
which(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,
or = or, pattern = pattern, envir = envir))
}
#' Gets elements of a character vector
#'
#' Convenient way to get elements from a character vector.
#'
#' @inheritParams string_is
#'
#' @param x A character vector.
#' @param seq Logical, default is `FALSE`. The argument `pattern` accepts a vector of
#' patterns which are combined with an `and` by default. If `seq = TRUE`, then it is like
#' if `string_get` was called sequentially with its results stacked. See examples.
#' @param seq.unik Logical, default is `FALSE`. The argument `...` (or the argument `pattern`) accepts
#' a vector of patterns which are combined with an `and` by default. If `seq.unik = TRUE`, then
#' `string_get` is called sequentially with its results stacked, and `unique()` is
#' applied in the end. See examples.
#'
#' @details
#' This function is a wrapper to [string_is()].
#'
#' @inheritSection string_is Generic regular expression flags
#'
#' @section Caching:
#'
#' In an exploratory stage, it can be useful to quicky get values from a vector with the
#' least hassle as possible. Hence `string_get` implements caching, so that users do not need
#' to repeat the value of the argument `x` in successive function calls, and can concentrate
#' only on the selection patterns.
#'
#' Caching is a feature only available when the user calls `string_get` from the global environment.
#' If that feature were available in regular code, it would be too dangerous, likely leading to hard to debug bugs.
#' Hence caching is disabled when used within code (i.e. inside a function or inside an
#' automated script), and function calls without the main argument will lead to errors in such scripts.
#'
#' @author
#' Laurent R. Berge
#'
#' @return
#' It always return a character vector.
#'
#' @inherit string_clean seealso
#'
#'
#' @examples
#'
#' x = rownames(mtcars)
#'
#' # find all Mazda cars
#' string_get(x, "Mazda")
#' # same with ignore case flag
#' string_get(x, "i/mazda")
#'
#' # all cars containing a single digit (we use the 'word' flag)
#' string_get(x, "w/\\d")
#'
#' # finds car names without numbers AND containing `u`
#' string_get(x, "!\\d", "u")
#' # equivalently
#' string_get(x, "!\\d & u")
#'
#' # Stacks all Mazda and Volvo cars. Mazda first
#' string_get(x, "Mazda", "Volvo", seq = TRUE)
#'
#' # Stacks all Mazda and Volvo cars. Volvo first
#' string_get(x, "Volvo", "Mazda", seq = TRUE)
#'
#' # let's get the first word of each car name
#' car_first = string_ops(x, "extract.first")
#' # we select car brands ending with 'a', then ending with 'i'
#' string_get(car_first, "a$", "i$", seq = TRUE)
#' # seq.unik is similar to seq but applies unique()
#' string_get(car_first, "a$", "i$", seq.unik = TRUE)
#'
#' #
#' # flags
#' #
#'
#' # you can combine the flags
#' x = string_magic("/One, two, one... Two!, Microphone, check")
#' # regular
#' string_get(x, "one")
#' # ignore case
#' string_get(x, "i/one")
#' # + word boundaries
#' string_get(x, "iw/one")
#'
#' # you can escape the meaning of ! with backslashes
#' string_get(x, "\\!")
#'
#' #
#' # Caching
#' #
#'
#' # Caching is enabled when the function is used interactively
#' # so you don't need to repeat the argument 'x'
#' # Mostly useful at an exploratory stage
#'
#' if(interactive() && is.null(sys.calls())){
#'
#' # first run, the data is cached
#' string_get(row.names(mtcars), "i/vol")
#'
#' # now you don't need to specify the data
#' string_get("i/^m & 4")
#' }
#'
#'
#'
#'
string_get = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,
or = FALSE, seq = FALSE, seq.unik = FALSE,
pattern = NULL, envir = parent.frame()){
x = check_set_character(x, mbt = TRUE, l0 = TRUE)
if(length(x) == 0){
return(character(0))
}
# data caching in interactive mode
is_caching = FALSE
is_forced_caching = isTRUE(getOption("string_magic_string_get_forced_caching"))
if(is_forced_caching || (interactive() && identical(parent.frame(), .GlobalEnv))){
mc = match.call()
if(is.character(mc$x) && !is.null(getOption("stringmagic_string_get_cache"))){
is_caching = TRUE
x_pattern = x
x = getOption("stringmagic_string_get_cache")
} else if(length(x) > 1){
options(stringmagic_string_get_cache = x)
}
}
check_character(pattern, null = TRUE, no_na = TRUE)
check_logical(or, scalar = TRUE)
check_logical(seq, scalar = TRUE)
check_logical(seq.unik, scalar = TRUE)
if(missnull(pattern)){
if(is_caching){
dots = check_set_dots(..., mbt = FALSE, character = TRUE, scalar = TRUE, no_na = TRUE)
dots = append(dots, x_pattern, 0)
} else {
dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)
}
warn_no_ 559E named_dots(dots)
pattern = unlist(dots)
}
if(seq.unik){
seq = TRUE
}
if(seq){
for(i in seq_along(pattern)){
value = string_get(x, pattern = pattern[i], or = or, seq = FALSE,
fixed = fixed, ignore.case = ignore.case,
word = word, envir = envir)
if(i == 1){
res = value
} else {
res = c(res, value)
}
}
if(seq.unik){
res = unique(res)
}
return(res)
}
index = string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,
pattern = pattern, or = or, envir = envir)
x[index]
}
#' Splits a character string wrt a pattern
#'
#' Splits a character string with respect to pattern
#'
#' @inheritParams string_split2df
#'
#' @param x A character vector.
#' @param simplify Logical scalar, default is `TRUE`. If `TRUE`, then when the vector input `x`
#' is of length 1, a character vector is returned instead of a list.
#'
#' @inheritSection string_is Generic regular expression flags
#'
#' @return
#' If `simplify = TRUE` (default), the object returned is:
#' + a character vector if `x`, the vector in input, is of length 1: the character vector contains
#' the result of the split.
#' + a list of the same length as `x`. The ith element of the list is a character vector
#' containing the result of the split of the ith element of `x`.
#'
#' If `simplify = FALSE`, the object returned is always a list.
#'
#' @examples
#'
#' time = "This is the year 2024."
#'
#' # we break the sentence
#' string_split(time, " ")
#'
#' # simplify = FALSE leads to a list
#' string_split(time, " ", simplify = FALSE)
#'
#' # let's break at "is"
#' string_split(time, "is")
#'
#' # now breaking at the word "is"
#' # NOTE: we use the flag `word` (`w/`)
#' string_split(time, "w/is")
#'
#' # same but using a pattern from a variable
#' # NOTE: we use the `magic` flag
#' pat = "is"
#' string_split(time, "mw/{pat}")
#'
#'
string_split = function(x, split, simplify = TRUE, fixed = FALSE,
ignore.case = FALSE, word = FALSE,
envir = parent.frame()){
x = check_set_character(x, mbt = TRUE, l0 = TRUE)
check_logical(simplify, scalar = TRUE)
check_logical(fixed, scalar = TRUE)
check_logical(ignore.case, scalar = TRUE)
check_logical(word, scalar = TRUE)
pat_parsed = format_simple_regex_flags(split, ignore = ignore.case,
fixed = fixed, word = word,
magic = TRUE, envir = envir)
split = pat_parsed$pattern
is_fixed = pat_parsed$fixed
x_split = strsplit(x, split, fixed = is_fixed, perl = !is_fixed)
if(simplify && length(x) == 1){
x_split = x_split[[1]]
}
x_split
}
#' Splits a character vector into a data frame
#'
#' Splits a character vector and formats the resulting substrings into a data.frame
#'
#' @inheritParams string_is
#'
#' @param x A character vector or a two-sided formula. If a two-sided formula, then the
#' argument `data` must be provided since the variables will be fetched in there.
#' A formula is of the form `char_var ~ id1 + id2` where `char_var` on the left is a
#' character variable and on the right `id1` and `id2` are identifiers which will be
#' included in the resulting table. Alternatively, you can provide identifiers via
#' the argument `id`.
#' @param data Optional, only used if the argument `x` is a formula. It should
#' contain the variables of the formula.
#' @param split A character scalar. Used to split the character vectors. By default
#' this is a regular expression. You can use flags in the pattern in the form `flag1, flag2/pattern`.
#' Available flags are `ignore` (case), `fixed` (no regex), word (add word boundaries),
#' magic (add interpolation with `"{}"`). Example:
#' if "ignore/hello" and the text contains "Hello", it will be split at "Hello".
#' Shortcut: use the first letters of the flags. Ex: "iw/one" will split at the word
#' "one" (flags 'ignore' + 'word').
#' @param id Optional. A character vector or a list of vectors. If provided, the
#' values of `id` are considered as identifiers that will be included in the resulting table.
#' @param add.pos Logical, default is `FALSE`. Whether to include the position of each split element.
#' @param id_unik Logical, default is `TRUE`. In the case identifiers are provided,
#' whether to trigger a message if the identifiers are not unique. Indeed, if
#' the identifiers are not unique, it is not possible to reconstruct the original texts.
#' @param fixed Logical, default is `FALSE`. Whether to consider the argument `split`
#' as fixed (and not as a regular expression).
#' @param dt Logical, default is `FALSE`. Whether to return a `data.table`. See also the function `string_split2dt`.
#' @param ... Not currently used.
#'
#' @return
#' It returns a `data.frame` or a `data.table` which will contain: i) `obs`: the observation index,
#' ii) `pos`: the position of the text element in the initial string (optional, via add.pos),
#' iii) the text element, iv) the identifier(s) (optional, only if `id` was provided).
#'
#' @inherit string_clean seealso
#'
#' @examples
#'
#' x = c("Nor rain, wind, thunder, fire are my daughters.",
#' "When my information changes, I alter my conclusions.")
#'
#' id = c("ws", "jmk")
#'
#' # we split at each word
#' string_split2df(x, "[[:punct:] ]+")
#'
#' # we add the 'id'
#' string_split2df(x, "[[:punct:] ]+", id = id)
#'
#' # TO NOTE:
#' # - the second argument is `data`
#' # - when it is missing, the argument `split` becomes implicitly the second
#' # - ex: above we did not use `split = "[[:punct:] ]+"`
#'
#' #
#' # using the formula
#'
#' base = data.frame(text = x, my_id = id)
#' string_split2df(text ~ my_id, base, "[[:punct:] ]+")
#'
#' #
#' # with 2+ identifiers
#'
#' base = within(mtcars, carname <- rownames(mtcars))
#'
#' # we have a message because the identifiers are not unique
#' string_split2df(carname ~ am + gear + carb, base, " +")
#'
#' # adding the position of the words & removing the message
#' string_split2df(carname ~ am + gear + carb, base, " +", id_unik = FALSE, add.pos = TRUE)
#'
#'
string_split2df = function(x, data = NULL, split = NULL, id = NULL, add.pos = FALSE,
id_unik = TRUE, fixed = FALSE, ignore.case = FALSE,
word = FALSE, envir = parent.frame(), dt = FALSE, ...){
if(missing(x)){
stop("Argument 'x' must be provied. PROBLEM: it is missing.")
}
if(missnull(data) && missnull(split)){
stop("Argument `split` must be provided. PROBLEM: it is missing.")
}
if(!missnull(data) && is.character(data) && length(data) == 1 && missnull(split)){
# argument sliding
split = data
data = NULL
}
#
# Formatting the input data
#
# used for the names
dots = list(...)
internal = FALSE
if("mc" %in% names(dots)){
mc = dots$mc
internal = TRUE
} else {
mc = match.call()
}
id_names = NULL
x_name = NULL
if(inherits(x, "formula")){
fml = x
if(!missnull(data) && !is.list(data)){
stop("When `x` is a formula, the argument `data` must be a list containing ",
"the variables in the formula.",
"\nPROBLEM: `data` is not a list.")
}
if(length(fml) < 3){
stop("When `x` is a formula, it must be two sided (on the left the variable,",
" on the right the identifier).\nPROBLEM: it is currently only one sided.")
}
x = try(eval(fml[[2]], data, enclos = parent.frame()), silent = TRUE)
# error handling
if(inherits(x, "try-error") || !is.atomic(x)){
vars = all.vars(fml[[2]])
if(!is.null(names(data))){
var_pblm = setdiff(vars, names(data))
if(length(var_pblm) > 0){
stop("The evaluation of the left side of `x` raised an error:\n",
string_magic("PROBLEM: the variable{$s, enum.bq, is ? var_pblm} not in the data set (`"),
deparse_short(mc$data), "`).")
}
}
if(inherits(x, "try-error")){
stop("The evaluation of the left side of `x` raised an error:\n", x)
}
stop("The evaluation of the left side of `x` raised an error:\n",
"PROBLEM: `x` is not a character vector, instead it is of class ", class(x)[1])
}
#
# elements from id
# we don't apply terms: we only separate with sum
terms_right = fml_extract_elements(fml)
id = list()
for(i in seq_along(terms_right)){
term = terms_right[[i]]
id_name = deparse_short(term)
val = try(eval(term, data, enclos = parent.frame()), silent = TRUE)
# error handling
if(inherits(val, "try-error") || !is.atomic(val)){
vars = all.vars(term)
intro = string_magic("The evaluation of the right side of `x` raised an error.\n",
"VALUE TO EVAL: {bq ? id_name}\n")
if(!is.null(names(data))){
var_pblm = setdiff(vars, names(data))
if(length(var_pblm) > 0){
stop(intro,
string_magic("PROBLEM: the variable{$s, enum.bq, is ? var_pblm} not in the data set (`"),
deparse_short(mc$data), "`).")
}
}
if(inherits(val, "try-error")){
stop(intro, "ERROR: ", val)
}
stop(intro,
"PROBLEM: this identifier is not atomic, instead it is of class ", class(x)[1])
}
id[[id_name]] = val
}
if(!is.character(x)){
x = as.character(x)
}
x_name = deparse_short(fml[[2]])
id_names = names(id)
} else {
check_set_character(x)
x_name = "x"
}
n = length(x)
if(!missnull(id)){
# id: a) a vector, b) a data frame. Same length as x
if(is.list(id)){
n_id = unique(lengths(id))
if(length(n_id) != 1 || max(n_id) != n){
extra = ""
if(max(n_id) != n) extra = string_magic("\nPROBELM: len x: {#n ? n} len id: {#n ? max(n_id)}.")
stop("The argument `id` must be either a vector of identifiers or a data.frame ",
"of identifiers of the same length as `x`.", extra)
}
id_names = names(id)
if(is.null(id_names)){
id_txt = deparse_short(mc$id)
if(length(id_names) > 1){
id_names = paste0(id_txt, "..", 1:length(id))
} else {
id_names = id_txt
}
}
} else {
n_id = length(id)
if(n != n_id){
stopi("The argument `id` must be either a vector of identifiers or a data.frame ",
"of identifiers of the same length as `x`.",
"\nPROBELM: len x: {n ? n}; len id: {n ? n_id}.")
}
if(!is.atomic(id)){
stop("The argument `id` must be either a vector of identifiers or a data.frame ",
"of identifiers of the same length as `x`.",
"\nPROBLEM: `id` is not an atomic vector")
}
id_names = deparse_short(mc$id)
id = list(id)
}
}
add.id = FALSE
obs = 1:n
if(!missnull(id)){
add.id = TRUE
id_raw = id
id = to_index(list = id)
if(id_unik && max(id) != length(id)){
message("The identifiers are not unique, you will not be able to reconstruct the data using only them.")
}
}
# flags
pat_parsed = format_simple_regex_flags(split, ignore = ignore.case,
fixed = fixed, word = word,
magic = TRUE, envir = parent.frame(1 + internal))
split = pat_parsed$pattern
is_fixed = pat_parsed$fixed
x_split = strsplit(x, split, fixed = is_fixed, perl = !is_fixed)
n_all = lengths(x_split)
# the result table
if(add.pos){
obs_all = rep(obs, n_all)
pos = cpp_create_pos(obs_all)
res = data.frame(obs = obs_all, pos = pos, x = unlist(x_split),
row.names = NULL)
} else {
res = data.frame(obs = rep(obs, n_all), x = unlist(x_split),
row.names = NULL)
}
0