Regular expression examples

Regular expression examples

(subject.size.vec <- unique(as.integer(10^seq(0,3.5,l=100))))
#>  [1]    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
#> [16]   17   18   20   22   23   25   28   30   33   35   38   42   45   49   53
#> [31]   58   63   68   74   81   87   95  103  112  121  132  143  155  168  183
#> [46]  198  215  233  253  275  298  323  351  380  413  448  486  527  572  620
#> [61]  673  730  792  859  932 1011 1097 1190 1291 1401 1519 1648 1788 1940 2104
#> [76] 2283 2477 2687 2915 3162
(backtrackers <- c(
  if(requireNamespace("stringi"))atime::atime_grid(
    ICU=stringi::stri_match(subject, regex=pattern)),
  atime::atime_grid(
    PCRE=regexpr(pattern, subject, perl=TRUE),
    TRE=regexpr(pattern, subject, perl=FALSE))))
#> Le chargement a nécessité le package : stringi
#> $ICU
#> stringi::stri_match(subject, regex = pattern)
#> 
#> $PCRE
#> regexpr(pattern, subject, perl = TRUE)
#> 
#> $TRE
#> regexpr(pattern, subject, perl = FALSE)
backtrackers.result <- atime::atime(
  N=subject.size.vec,
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("(a)?", "\\1"), each=N), collapse="")
  },
  expr.list=backtrackers)
backtrackers.best <- atime::references_best(backtrackers.result)
plot(backtrackers.best)
#> Warning: Transformation introduced infinite values in continuous y-axis

plot of chunk unnamed-chunk-1

The plot above shows that ICU/PCRE/TRE are all exponential in N (subject/pattern size) when the pattern contains backreferences.

all.exprs <- c(
  if(requireNamespace("re2"))atime::atime_grid(
    RE2=re2::re2_match(subject, pattern)),
  backtrackers)
#> Le chargement a nécessité le package : re2
all.result <- atime::atime(
  N=subject.size.vec,
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("a?", "a"), each=N), collapse="")
  },
  expr.list=all.exprs)
all.best <- atime::references_best(all.result)
plot(all.best)
#> Warning: Transformation introduced infinite values in continuous y-axis

plot of chunk unnamed-chunk-2

The plot above shows that ICU/PCRE are exponential time whereas RE2/TRE are polynomial time. Exercise for the reader: modify the above code to use the seconds.limit argument so that you can see what happens to ICU/PCRE for larger N (hint: you should see a difference at larger sizes).

Interpolate at seconds.limit using predict method

(all.pred <- predict(all.best))
#> atime_prediction object
#>       unit expr.name unit.value         N
#>     <char>    <char>      <num>     <num>
#> 1: seconds       RE2       0.01 268.81060
#> 2: seconds       ICU       0.01  16.04799
#> 3: seconds      PCRE       0.01  16.46585
#> 4: seconds       TRE       0.01  74.74111
summary(all.pred)
#>               Length Class      Mode   
#> seconds.limit  1     -none-     numeric
#> references    16     data.table list   
#> measurements  23     data.table list   
#> prediction     4     data.table list

The predict method above returns a list with a new element named prediction, which shows the data sizes that can be computed with a given time budget. The plot method is used below,

plot(all.pred)
#> Warning: Transformation introduced infinite values in continuous x-axis

plot of chunk unnamed-chunk-4

atime_grid to compare different engines

In the nc package there is an engine argument which controls which C regex library is used:

nc.exprs <- atime::atime_grid(
  list(ENGINE=c(
    if(requireNamespace("re2"))"RE2",
    "PCRE",
    if(requireNamespace("stringi"))"ICU")),
  nc=nc::capture_first_vec(subject, pattern, engine=ENGINE))
nc.result <- atime::atime(
  N=subject.size.vec,
  setup={
    rep.collapse <- function(chr)paste(rep(chr, N), collapse="")
    subject <- rep.collapse("a")
    pattern <- list(maybe=rep.collapse("a?"), rep.collapse("a"))
  },
  expr.list=nc.exprs)
nc.best <- atime::references_best(nc.result)
plot(nc.best)

plot of chunk unnamed-chunk-5

The result/plot above is consistent with the previous result.