stringmagic/R/string_tools.R at master · cran/stringmagic

History

2439 lines (2083 loc) · 86.1 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

#----------------------------------------------#

# Author: Laurent Berge

# Date creation: Mon Aug 8 16:26:42 2022

# ~: string tools

#----------------------------------------------#

#' Chains basic operations to character vectors

#' Simple tool to perform multiple operations to character vectors.

#' @inheritParams string_clean

#' @param x A character vector. If not a character vector but atomistic (i.e. not a list),

#' it will be converted to a character vector.

#' @param ... Character **scalars**. Character scalar containing the comma separated values

#' of operations to perform to the vector. The 50+ operations are detailed in the help

#' page of [string_magic()].

#' @param op Character **vector** or `NULL` (default). Character scalar containing the comma separated values

#' of operations to perform to the vector. The 50+ operations are detailed in the help

#' page of [string_magic()]. Note that if this argument is provided, then the values in

#' `...` are ignored.

#' @param pre_unik Logical scalar, default is `NULL`. Whether to first unique the vector

#' before applying the possibly costly string operations, and merging back the result.

#' For very large vectors with repeated values the time gained can be substantial. By

#' default, this is `TRUE` for vector of length 1M or more.

#' @details

#' This function is a simple wrapper around string_magic. Formally, `string_ops(x, "op1, op2")`

#' is equivalent to `string_magic("{op1, op2 ? x}")`.

#' @return

#' In general it returns a character vector. It may be of a length different from the original

#' one, depending on the operations performed.

#' @author

#' Laurent R. Berge

#' @inherit string_clean seealso

#' @family tools with aliases

#' @examples

#' # data on car models

#' cars = row.names(mtcars)

#' # let's get the brands starting with an "m"

#' string_ops(cars, "'i/^m'get, x, unik")

#' # Explainer:

#' # 'i/^m'get: keeps only the elements starting with an m,

#' # i/ is the 'regex-flag' "ignore" to ignore the case

#' # ^m means "starts with an m" in regex language

#' # x: extracts the first pattern. The default pattern is "[[:alnum:]]+"

#' # which means an alpha-numeric word

#' # unik: applies unique() to the vector

#' # => see help in ?string_magic for more details on the operations

#' # let's get the 3 largest numbers appearing in the car models

#' string_ops(cars, "'\\d+'x, rm, unik, num, dsort, 3 first")

#' # Explainer:

#' # '\\d+'x: extracts the first pattern, the pattern meaning "a succession"

#' # of digits in regex language

#' # rm: removes elements equal to the empty string (default behavior)

#' # unik: applies unique() to the vector

#' # num: converts to numeric

#' # dsort: sorts in decreasing order

#' # 3 first: keeps only the first three elements

#' # You can use several character vectors as operations:

#' string_ops(cars,

#' "'\\d+'x, rm, unik",

#' "num, dsort, 3 first")

string_ops = function(x, ..., op = NULL, pre_unik = NULL, namespace = NULL, envir = parent.frame()){

if(missing(x)){

stop("Argument `x` must be provided. PROBLEM: it is currently missing.")

}

if(!is.atomic(x)){

stop("Argument `x` must be atomic. Currently it is of class `", class(x)[1], "`")

}

if(length(x) == 0){

return(x)

}

if(!true_character(x)){

x = as.character(x)

}

set_pblm_hook()

check_character(op, null = TRUE, no_na = TRUE)

if(!is.null(op)){

all_ops = op

} else {

all_ops = check_set_dots(..., mbt = TRUE, scalar = TRUE, character = TRUE)

all_ops = unlist(all_ops)

}

check_logical(pre_unik, null = TRUE, scalar = TRUE)

# For very large vectors, we unique

n = length(x)

if(is.null(pre_unik)){

pre_unik = n > 1e6

}

if(pre_unik){

x_int = to_index(x)

x_small = x[!duplicated(x_int)]

res_small = string_ops(x_small, op = all_ops, pre_unik = FALSE,

namespace = namespace, envir = envir)

res = res_small[x_int]

} else {

if(is.null(namespace)){

namespace = "R_GlobalEnv"

}

user_ops_all = getOption("string_magic_user_ops")

# beware the sneaky assignment!

if(!is.null(user_ops_all) && !is.null(user_info <- user_ops_all[[namespace]])){

.user_funs = user_info$funs

.valid_operators = user_info$operators

} else {

.user_funs = NULL

.valid_operators = getOption("string_magic_operations_default")

}

res = x

for(op in all_ops){

group_flag = 1 * grepl("~", op, fixed = TRUE)

res = apply_simple_operations(res, "string_ops", op, .check = TRUE, .envir = envir,

.data = list(), group_flag = group_flag,

.delim = c("{", "}"), .user_funs = .user_funs,

.valid_operators = .valid_operators)

}

if("group_index" %in% names(attributes(res))){

attr(res, "group_index") = NULL

}

res

}

#' Detects whether a pattern is in a character string

#' Function that detects if one or more patterns are in a string. The patterns can be

#' chained, by default this is a regex search but special flags be triggered with a

#' specific syntax, supports negation.

#' @param x A character vector.

#' @param ... Character scalars representing the patterns to be found. By default they are (perl) regular-expressions.

#' Use ' & ' or ' | ' to chain patterns and combine their result logically (ex: `'[[:alpha:]] & \\d'` gets strings

#' containing both letters and numbers). You can negate by adding a `!` first (ex: `"!sepal$"` will

#' return `TRUE` for strings that do not end with `"sepal"`).

#' Add flags with the syntax 'flag1, flag2/pattern'. Available flags are: 'fixed', 'ignore', 'word' and 'magic'.

#' Ex: "ignore/sepal" would get "Sepal.Length" (wouldn't be the case w/t 'ignore').

#' Shortcut: use the first letters of the flags. Ex: "if/dt[" would get `"DT[i = 5]"` (flags 'ignore' + 'fixed').

#' For 'word', it adds word boundaries to the pattern. The `magic` flag first interpolates

#' values directly into the pattern with "{}".

#' @param or Logical, default is `FALSE`. In the presence of two or more patterns,

#' whether to combine them with a logical "or" (the default is to combine them with a logical "and").

#' @param pattern (If provided, elements of `...` are ignored.) A character vector representing the

#' patterns to be found. By default a (perl) regular-expression search is triggered.

#' Use ' & ' or ' | ' to chain patterns and combine their result logically (ex: `'[[:alpha:]] & \\d'` gets strings

#' containing both letters and numbers). You can negate by adding a `!` first (ex: `"!sepal$"` will

#' return `TRUE` for strings that do not end with `"sepal"`).

#' Add flags with the syntax 'flag1, flag2/pattern'. Available flags are: 'fixed', 'ignore', 'word' and 'magic'.

#' Ex: "ignore/sepal" would get "Sepal.Length" (wouldn't be the case w/t 'ignore').

#' Shortcut: use the first letters of the flags. Ex: "if/dt[" would get `"DT[i = 5]"` (flags 'ignore' + 'fixed').

#' For 'word', it adds word boundaries to the pattern. The `magic` flag first interpolates

#' values directly into the pattern with "{}".

#' @param fixed Logical scalar, default is `FALSE`. Whether to trigger a fixed search instead of a

#' regular expression search (default).

#' @param word Logical scalar, default is `FALSE`. If `TRUE` then a) word boundaries are added to the pattern,

#' and b) patterns can be chained by separating them with a comma, they are combined with an OR logical operation.

#' Example: if `word = TRUE`, then pattern = "The, mountain" will select strings containing either the word

#' 'The' or the word 'mountain'.

#' @param ignore.case Logical scalar, default is `FALSE`. If `TRUE`, then case insensitive search is triggered.

#' @param last A function or `NULL` (default). If a function, it will be applied to the vector

#' just before returning it.

#' @param envir Environment in which to evaluate the interpolations if the flag `"magic"` is provided.

#' Default is `parent.frame()`.

#' @details

#' The internal function used to find the patterns is [base::grepl()] with `perl = TRUE`.

#' @section Generic regular expression flags:

#' All `stringmagic` functions support generic flags in regular-expression patterns.

#' The flags are useful to quickly give extra instructions, similarly to *usual*

#' [regular expression flags](https://javascript.info/regexp-introduction).

#' Here the syntax is "flag1, flag2/pattern". That is: flags are a comma separated list of flag-names

#' separated from the pattern with a slash (`/`). Example: `string_which(c("hello...", "world"), "fixed/.")` returns `1`.

#' Here the flag "fixed" removes the regular expression meaning of "." which would have otherwise meant *"any character"*.

#' The no-flag verion `string_which(c("hello...", "world"), ".")` returns `1:2`.

#' Alternatively, and this is recommended, you can collate the initials of the flags instead of using a

#' comma separated list. For example: "if/dt[" will apply the flags "ignore" and "fixed" to the pattern "dt[".

#' The four flags always available are: "ignore", "fixed", "word" and "magic".

#' + "ignore" instructs to ignore the case. Technically, it adds the perl-flag "(?i)"

#' at the beginning of the pattern.

#' + "fixed" removes the regular expression interpretation, so that the characters ".", "$", "^", "["

#' (among others) lose their special meaning and are treated for what they are: simple characters.

#' + "word" adds word boundaries (`"\\b"` in regex language) to the pattern. Further, the comma (`","`)

#' becomes a word separator. Technically, "word/one, two" is treated as "\\b(one|two)\\b". Example:

#' `string_clean("Am I ambushed?", "wi/am")` leads to " I ambushed?" thanks to the flags "ignore" and "word".

#' + "magic" allows to interpolate variables inside the pattern before regex interpretation.

#' For example if `letters = "aiou"` then `string_clean("My great goose!", "magic/[{letters}] => e")`

#' leads to `"My greet geese!"`

#' @return

#' It returns a logical vector of the same length as `x`.

#' The function `string_which` returns a numeric vector.

#' @author

#' Laurent R. Berge

#' @inherit string_clean seealso

#' @examples

#' # NOTA: using `string_get` instead of `string_is` may lead to a faster understanding

#' # of the examples

#' x = string_vec("One, two, one... two, microphone, check")

#' # default is regular expression search

#' # => 3 character items

#' string_is(x, "^...$")

#' # to trigger fixed search use the flag 'fixed'

#' string_is(x, "fixed/...")

#' # you can just use the first letter

#' string_is(x, "f/...")

#' # to negate, use '!' as the first element of the pattern

#' string_is(x, "f/!...")

#' # you can combine several patterns with " & " or " | "

#' string_is(x, "one & c")

#' string_is(x, "one | c")

#' #

#' # word: adds word boundaries

#' #

#' # compare

#' string_is(x, "one")

#' # with

#' string_is(x, "w/one")

#' # words can be chained with commas (it is like an OR logical operation)

#' string_is(x, "w/one, two")

#' # compare with

#' string_is(x, "w/one & two")

#' # remember that you can still negate

#' string_is(x, "w/one & !two")

#' # you can combine the flags

#' # compare

#' string_is(x, "w/one")

#' # with

#' string_is(x, "wi/one")

#' #

#' # the `magic` flag

#' #

#' p = "one"

#' string_is(x, "m/{p}")

#' # Explanation:

#' # - "p" is interpolated into "one"

#' # - we get the equivalent: string_is(x, "one")

#' #

#' # string_which

#' #

#' # it works exactly the same way as string_is

#' # Which are the items containing an 'e' and an 'o'?

#' string_which(x, "e", "o")

#' # equivalently

#' string_which(x, "e & o")

string_is = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,

or = FALSE, pattern = NULL, envir = parent.frame(), last = NULL){

x = check_set_character(x, mbt = TRUE, l0 = TRUE)

if(length(x) == 0){

return(logical(0))

}

check_logical(ignore.case, scalar = TRUE)

check_logical(fixed, scalar = TRUE)

check_logical(word, scalar = TRUE)

check_logical(or, scalar = TRUE)

check_character(pattern, null = TRUE, no_na = TRUE)

check_function(last, null = TRUE)

if(missnull(pattern)){

dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)

if(!is.null(names(dots))){

if(is.null(last)){

warn_no_named_dots(dots)

} else {

# we remove the arguments that go to "last"

dot_names = names(dots)

args_last = names(formals(args(last)))

if(!is.null(args_last)){

args_ok = intersect(dot_names, args_last)

dots[args_ok] = NULL

if(sum(nchar(dot_names) > 0) != length(args_ok)){

warn_no_named_dots(dots, extra_args = args_last, extra_funName = "last")

}

pattern = unlist(dots)

}

or_origin = or

negate = FALSE

logical_op = function(a, b, or) if(or) a | b else a & b

res = NULL

for(i in seq_along(pattern)){

pat = pattern[i]

first_char = substr(pat, 1, 1)

pat_parsed = parse_regex_pattern(pat, c("fixed", "word", "ignore", "magic"),

envir = envir)

is_or = pat_parsed$is_or

flags = pat_parsed$flags

all_patterns = pat_parsed$patterns

negate_all = pat_parsed$is_not

is_fixed_origin = fixed || "fixed" %in% flags

is_word = word || "word" %in% flags

is_ignore = ignore.case || "ignore" %in% flags

res_current = NULL

n_pat = length(all_patterns)

for(j in 1:n_pat){

is_fixed = is_fixed_origin

p = all_patterns[j]

sub_negate = negate_all[j]

p = format_pattern(p, fixed = is_fixed, word = is_word, ignore = is_ignore)

is_fixed = attr(p, "fixed")

res_tmp = tryCatch(grepl(p, x, perl = !is_fixed, fixed = is_fixed),

error = function(e) structure(conditionMessage(e), class = "try-error"),

warning = function(w) structure(conditionMessage(w), class = "try-warning"))

is_warn = inherits(res_tmp, "try-warning")

warn_msg = ""

if(is_warn){

# is there an error?

warn_msg = res_tmp

res_tmp = tryCatch(suppressWarnings(grepl(p, x, perl = !is_fixed, fixed = is_fixed)),

error = function(e) structure(conditionMessage(e), class = "try-error"))

}

if(inherits(res_tmp, "try-error")){

pat_raw = pattern[i]

stopi("CONTEXT: {&p != pat_raw;evaluation of {Q?pat_raw}\n }",

"pattern = {Q?p} ",

"\nEXPECTATION: the pattern must be a valid regular expression",

"\nPROBLEM: `grepl` led to an error, see below:",

"\n{res_tmp}",

"{&nchar(warn_msg) > 0;\n{.}}")

} else if(is_warn){

pat_raw = pattern[i]

warni("CONTEXT: {&p != pat_raw;evaluation of {Q?pat_raw}\n }",

"pattern = {Q?p} ",

"\nA warning was raised when evaluating the pattern:",

"\n{warn_msg}", immediate. = TRUE)

}

if(sub_negate){

res_tmp = !res_tmp

}

if(is.null(res_current)){

res_current = res_tmp

} else {

res_current = logical_op(res_current, res_tmp, is_or[j])

}

if(is.null(res)){

res = res_current

} else {

res = logical_op(res, res_current, or)

}

if(!is.null(last)){

res = check_set_eval_fun(last, res, ...)

}

res

}

#' @describeIn string_is Detects if at least one element of a vector matches a regex pattern

string_any = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,

or = FALSE, pattern = NULL, envir = parent.frame()){

check_character(pattern, null = TRUE, no_na = TRUE)

if(missnull(pattern)){

dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)

warn_no_named_dots(dots)

pattern = unlist(dots)

}

any(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,

or = or, pattern = pattern, envir = envir))

}

#' @describeIn string_is Detects if all elements of a vector match a regex pattern

string_all = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,

or = FALSE, pattern = NULL, envir = parent.frame()){

check_character(pattern, null = TRUE, no_na = TRUE)

if(missnull(pattern)){

dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)

warn_no_named_dots(dots)

pattern = unlist(dots)

}

all(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,

or = or, pattern = pattern, envir = envir))

}

#' @describeIn string_is Returns the indexes of the values in which a pattern is detected

string_which = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,

or = FALSE, pattern = NULL, envir = parent.frame()){

check_character(pattern, null = TRUE, no_na = TRUE)

if(missnull(pattern)){

dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)

warn_no_named_dots(dots)

pattern = unlist(dots)

}

which(string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,

or = or, pattern = pattern, envir = envir))

}

#' Gets elements of a character vector

#' Convenient way to get elements from a character vector.

#' @inheritParams string_is

#' @param x A character vector.

#' @param seq Logical, default is `FALSE`. The argument `pattern` accepts a vector of

#' patterns which are combined with an `and` by default. If `seq = TRUE`, then it is like

#' if `string_get` was called sequentially with its results stacked. See examples.

#' @param seq.unik Logical, default is `FALSE`. The argument `...` (or the argument `pattern`) accepts

#' a vector of patterns which are combined with an `and` by default. If `seq.unik = TRUE`, then

#' `string_get` is called sequentially with its results stacked, and `unique()` is

#' applied in the end. See examples.

#' @details

#' This function is a wrapper to [string_is()].

#' @inheritSection string_is Generic regular expression flags

#' @section Caching:

#' In an exploratory stage, it can be useful to quicky get values from a vector with the

#' least hassle as possible. Hence `string_get` implements caching, so that users do not need

#' to repeat the value of the argument `x` in successive function calls, and can concentrate

#' only on the selection patterns.

#' Caching is a feature only available when the user calls `string_get` from the global environment.

#' If that feature were available in regular code, it would be too dangerous, likely leading to hard to debug bugs.

#' Hence caching is disabled when used within code (i.e. inside a function or inside an

#' automated script), and function calls without the main argument will lead to errors in such scripts.

#' @author

#' Laurent R. Berge

#' @return

#' It always return a character vector.

#' @inherit string_clean seealso

#' @examples

#' x = rownames(mtcars)

#' # find all Mazda cars

#' string_get(x, "Mazda")

#' # same with ignore case flag

#' string_get(x, "i/mazda")

#' # all cars containing a single digit (we use the 'word' flag)

#' string_get(x, "w/\\d")

#' # finds car names without numbers AND containing `u`

#' string_get(x, "!\\d", "u")

#' # equivalently

#' string_get(x, "!\\d & u")

#' # Stacks all Mazda and Volvo cars. Mazda first

#' string_get(x, "Mazda", "Volvo", seq = TRUE)

#' # Stacks all Mazda and Volvo cars. Volvo first

#' string_get(x, "Volvo", "Mazda", seq = TRUE)

#' # let's get the first word of each car name

#' car_first = string_ops(x, "extract.first")

#' # we select car brands ending with 'a', then ending with 'i'

#' string_get(car_first, "a$", "i$", seq = TRUE)

#' # seq.unik is similar to seq but applies unique()

#' string_get(car_first, "a$", "i$", seq.unik = TRUE)

#' #

#' # flags

#' #

#' # you can combine the flags

#' x = string_magic("/One, two, one... Two!, Microphone, check")

#' # regular

#' string_get(x, "one")

#' # ignore case

#' string_get(x, "i/one")

#' # + word boundaries

#' string_get(x, "iw/one")

#' # you can escape the meaning of ! with backslashes

#' string_get(x, "\\!")

#' #

#' # Caching

#' #

#' # Caching is enabled when the function is used interactively

#' # so you don't need to repeat the argument 'x'

#' # Mostly useful at an exploratory stage

#' if(interactive() && is.null(sys.calls())){

#' # first run, the data is cached

#' string_get(row.names(mtcars), "i/vol")

#' # now you don't need to specify the data

#' string_get("i/^m & 4")

#' }

string_get = function(x, ..., fixed = FALSE, ignore.case = FALSE, word = FALSE,

or = FALSE, seq = FALSE, seq.unik = FALSE,

pattern = NULL, envir = parent.frame()){

x = check_set_character(x, mbt = TRUE, l0 = TRUE)

if(length(x) == 0){

return(character(0))

}

# data caching in interactive mode

is_caching = FALSE

is_forced_caching = isTRUE(getOption("string_magic_string_get_forced_caching"))

if(is_forced_caching || (interactive() && identical(parent.frame(), .GlobalEnv))){

mc = match.call()

if(is.character(mc$x) && !is.null(getOption("stringmagic_string_get_cache"))){

is_caching = TRUE

x_pattern = x

x = getOption("stringmagic_string_get_cache")

} else if(length(x) > 1){

options(stringmagic_string_get_cache = x)

}

check_character(pattern, null = TRUE, no_na = TRUE)

check_logical(or, scalar = TRUE)

check_logical(seq, scalar = TRUE)

check_logical(seq.unik, scalar = TRUE)

if(missnull(pattern)){

if(is_caching){

dots = check_set_dots(..., mbt = FALSE, character = TRUE, scalar = TRUE, no_na = TRUE)

dots = append(dots, x_pattern, 0)

} else {

dots = check_set_dots(..., mbt = TRUE, character = TRUE, scalar = TRUE, no_na = TRUE)

}

warn_no_ 559E named_dots(dots)

pattern = unlist(dots)

}

if(seq.unik){

seq = TRUE

}

if(seq){

for(i in seq_along(pattern)){

value = string_get(x, pattern = pattern[i], or = or, seq = FALSE,

fixed = fixed, ignore.case = ignore.case,

word = word, envir = envir)

if(i == 1){

res = value

} else {

res = c(res, value)

}

if(seq.unik){

res = unique(res)

}

return(res)

}

index = string_is(x, fixed = fixed, ignore.case = ignore.case, word = word,

pattern = pattern, or = or, envir = envir)

x[index]

}

#' Splits a character string wrt a pattern

#' Splits a character string with respect to pattern

#' @inheritParams string_split2df

#' @param x A character vector.

#' @param simplify Logical scalar, default is `TRUE`. If `TRUE`, then when the vector input `x`

#' is of length 1, a character vector is returned instead of a list.

#' @inheritSection string_is Generic regular expression flags

#' @return

#' If `simplify = TRUE` (default), the object returned is:

#' + a character vector if `x`, the vector in input, is of length 1: the character vector contains

#' the result of the split.

#' + a list of the same length as `x`. The ith element of the list is a character vector

#' containing the result of the split of the ith element of `x`.

#' If `simplify = FALSE`, the object returned is always a list.

#' @examples

#' time = "This is the year 2024."

#' # we break the sentence

#' string_split(time, " ")

#' # simplify = FALSE leads to a list

#' string_split(time, " ", simplify = FALSE)

#' # let's break at "is"

#' string_split(time, "is")

#' # now breaking at the word "is"

#' # NOTE: we use the flag `word` (`w/`)

#' string_split(time, "w/is")

#' # same but using a pattern from a variable

#' # NOTE: we use the `magic` flag

#' pat = "is"

#' string_split(time, "mw/{pat}")

string_split = function(x, split, simplify = TRUE, fixed = FALSE,

ignore.case = FALSE, word = FALSE,

envir = parent.frame()){

x = check_set_character(x, mbt = TRUE, l0 = TRUE)

check_logical(simplify, scalar = TRUE)

check_logical(fixed, scalar = TRUE)

check_logical(ignore.case, scalar = TRUE)

check_logical(word, scalar = TRUE)

pat_parsed = format_simple_regex_flags(split, ignore = ignore.case,

fixed = fixed, word = word,

magic = TRUE, envir = envir)

split = pat_parsed$pattern

is_fixed = pat_parsed$fixed

x_split = strsplit(x, split, fixed = is_fixed, perl = !is_fixed)

if(simplify && length(x) == 1){

x_split = x_split[[1]]

}

x_split

}

#' Splits a character vector into a data frame

#' Splits a character vector and formats the resulting substrings into a data.frame

#' @inheritParams string_is

#' @param x A character vector or a two-sided formula. If a two-sided formula, then the

#' argument `data` must be provided since the variables will be fetched in there.

#' A formula is of the form `char_var ~ id1 + id2` where `char_var` on the left is a

#' character variable and on the right `id1` and `id2` are identifiers which will be

#' included in the resulting table. Alternatively, you can provide identifiers via

#' the argument `id`.

#' @param data Optional, only used if the argument `x` is a formula. It should

#' contain the variables of the formula.

#' @param split A character scalar. Used to split the character vectors. By default

#' this is a regular expression. You can use flags in the pattern in the form `flag1, flag2/pattern`.

#' Available flags are `ignore` (case), `fixed` (no regex), word (add word boundaries),

#' magic (add interpolation with `"{}"`). Example:

#' if "ignore/hello" and the text contains "Hello", it will be split at "Hello".

#' Shortcut: use the first letters of the flags. Ex: "iw/one" will split at the word

#' "one" (flags 'ignore' + 'word').

#' @param id Optional. A character vector or a list of vectors. If provided, the

#' values of `id` are considered as identifiers that will be included in the resulting table.

#' @param add.pos Logical, default is `FALSE`. Whether to include the position of each split element.

#' @param id_unik Logical, default is `TRUE`. In the case identifiers are provided,

#' whether to trigger a message if the identifiers are not unique. Indeed, if

#' the identifiers are not unique, it is not possible to reconstruct the original texts.

#' @param fixed Logical, default is `FALSE`. Whether to consider the argument `split`

#' as fixed (and not as a regular expression).

#' @param dt Logical, default is `FALSE`. Whether to return a `data.table`. See also the function `string_split2dt`.

#' @param ... Not currently used.

#' @return

#' It returns a `data.frame` or a `data.table` which will contain: i) `obs`: the observation index,

#' ii) `pos`: the position of the text element in the initial string (optional, via add.pos),

#' iii) the text element, iv) the identifier(s) (optional, only if `id` was provided).

#' @inherit string_clean seealso

#' @examples

#' x = c("Nor rain, wind, thunder, fire are my daughters.",

#' "When my information changes, I alter my conclusions.")

#' id = c("ws", "jmk")

#' # we split at each word

#' string_split2df(x, "[[:punct:] ]+")

#' # we add the 'id'

#' string_split2df(x, "[[:punct:] ]+", id = id)

#' # TO NOTE:

#' # - the second argument is `data`

#' # - when it is missing, the argument `split` becomes implicitly the second

#' # - ex: above we did not use `split = "[[:punct:] ]+"`

#' #

#' # using the formula

#' base = data.frame(text = x, my_id = id)

#' string_split2df(text ~ my_id, base, "[[:punct:] ]+")

#' #

#' # with 2+ identifiers

#' base = within(mtcars, carname <- rownames(mtcars))

#' # we have a message because the identifiers are not unique

#' string_split2df(carname ~ am + gear + carb, base, " +")

#' # adding the position of the words & removing the message

#' string_split2df(carname ~ am + gear + carb, base, " +", id_unik = FALSE, add.pos = TRUE)

string_split2df = function(x, data = NULL, split = NULL, id = NULL, add.pos = FALSE,

id_unik = TRUE, fixed = FALSE, ignore.case = FALSE,

word = FALSE, envir = parent.frame(), dt = FALSE, ...){

if(missing(x)){

stop("Argument 'x' must be provied. PROBLEM: it is missing.")

}

if(missnull(data) && missnull(split)){

stop("Argument `split` must be provided. PROBLEM: it is missing.")

}

if(!missnull(data) && is.character(data) && length(data) == 1 && missnull(split)){

# argument sliding

split = data

data = NULL

}

# Formatting the input data

# used for the names

dots = list(...)

internal = FALSE

if("mc" %in% names(dots)){

mc = dots$mc

internal = TRUE

} else {

mc = match.call()

}

id_names = NULL

x_name = NULL

if(inherits(x, "formula")){

fml = x

if(!missnull(data) && !is.list(data)){

stop("When `x` is a formula, the argument `data` must be a list containing ",

"the variables in the formula.",

"\nPROBLEM: `data` is not a list.")

}

if(length(fml) < 3){

stop("When `x` is a formula, it must be two sided (on the left the variable,",

" on the right the identifier).\nPROBLEM: it is currently only one sided.")

}

x = try(eval(fml[[2]], data, enclos = parent.frame()), silent = TRUE)

# error handling

if(inherits(x, "try-error") || !is.atomic(x)){

vars = all.vars(fml[[2]])

if(!is.null(names(data))){

var_pblm = setdiff(vars, names(data))

if(length(var_pblm) > 0){

stop("The evaluation of the left side of `x` raised an error:\n",

string_magic("PROBLEM: the variable{$s, enum.bq, is ? var_pblm} not in the data set (`"),

deparse_short(mc$data), "`).")

}

if(inherits(x, "try-error")){

stop("The evaluation of the left side of `x` raised an error:\n", x)

}

stop("The evaluation of the left side of `x` raised an error:\n",

"PROBLEM: `x` is not a character vector, instead it is of class ", class(x)[1])

}

# elements from id

# we don't apply terms: we only separate with sum

terms_right = fml_extract_elements(fml)

id = list()

for(i in seq_along(terms_right)){

term = terms_right[[i]]

id_name = deparse_short(term)

val = try(eval(term, data, enclos = parent.frame()), silent = TRUE)

# error handling

if(inherits(val, "try-error") || !is.atomic(val)){

vars = all.vars(term)

intro = string_magic("The evaluation of the right side of `x` raised an error.\n",

"VALUE TO EVAL: {bq ? id_name}\n")

if(!is.null(names(data))){

var_pblm = setdiff(vars, names(data))

if(length(var_pblm) > 0){

stop(intro,

string_magic("PROBLEM: the variable{$s, enum.bq, is ? var_pblm} not in the data set (`"),

deparse_short(mc$data), "`).")

}

if(inherits(val, "try-error")){

stop(intro, "ERROR: ", val)

}

stop(intro,

"PROBLEM: this identifier is not atomic, instead it is of class ", class(x)[1])

}

id[[id_name]] = val

}

if(!is.character(x)){

x = as.character(x)

}

x_name = deparse_short(fml[[2]])

id_names = names(id)

} else {

check_set_character(x)

x_name = "x"

}

n = length(x)

if(!missnull(id)){

# id: a) a vector, b) a data frame. Same length as x

if(is.list(id)){

n_id = unique(lengths(id))

if(length(n_id) != 1 || max(n_id) != n){

extra = ""

if(max(n_id) != n) extra = string_magic("\nPROBELM: len x: {#n ? n} len id: {#n ? max(n_id)}.")

stop("The argument `id` must be either a vector of identifiers or a data.frame ",

"of identifiers of the same length as `x`.", extra)

}

id_names = names(id)

if(is.null(id_names)){

id_txt = deparse_short(mc$id)

if(length(id_names) > 1){

id_names = paste0(id_txt, "..", 1:length(id))

} else {

id_names = id_txt

}

} else {

n_id = length(id)

if(n != n_id){

stopi("The argument `id` must be either a vector of identifiers or a data.frame ",

"of identifiers of the same length as `x`.",

"\nPROBELM: len x: {n ? n}; len id: {n ? n_id}.")

}

if(!is.atomic(id)){

stop("The argument `id` must be either a vector of identifiers or a data.frame ",

"of identifiers of the same length as `x`.",

"\nPROBLEM: `id` is not an atomic vector")

}

id_names = deparse_short(mc$id)

id = list(id)

}

add.id = FALSE

obs = 1:n

if(!missnull(id)){

add.id = TRUE

id_raw = id

id = to_index(list = id)

if(id_unik && max(id) != length(id)){

message("The identifiers are not unique, you will not be able to reconstruct the data using only them.")

}

# flags

pat_parsed = format_simple_regex_flags(split, ignore = ignore.case,

fixed = fixed, word = word,

magic = TRUE, envir = parent.frame(1 + internal))

split = pat_parsed$pattern

is_fixed = pat_parsed$fixed

x_split = strsplit(x, split, fixed = is_fixed, perl = !is_fixed)

n_all = lengths(x_split)

# the result table

if(add.pos){

obs_all = rep(obs, n_all)

pos = cpp_create_pos(obs_all)

res = data.frame(obs = obs_all, pos = pos, x = unlist(x_split),

row.names = NULL)

} else {

res = data.frame(obs = rep(obs, n_all), x = unlist(x_split),

row.names = NULL)

}

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

string_tools.R

Latest commit

History

string_tools.R

File metadata and controls