Remove part of a string based on overlapping patterns

New option with str_locate_all mentionned by Uwe in a comment under the question which greatly simplify the code:

library(stringr)
# Create function to remove matching part of text
# First argument is text, second argument is a list of start and length 
remove_matching_parts <- function(text, positions) {
  if (nrow(positions) == 0) return(text)
  ret <- strsplit(text,"")[[1]]
  lapply(1:nrow(positions), function(x) { ret[ positions[x,1]:positions[x,2] ] <<- NA } )
  paste0(ret[!is.na(ret)],separator="",collapse="")
}

# Loop over the data to apply the pattern
# row = length of vector, columns = length of pattern
matches <- lapply(dat$x, function(x) {
  do.call(rbind,str_locate_all(x, my_patterns)) # transform the list output of str_locate in a table of start/end
})

# Avoid growing a vector in a for loop, create it beforehand, it will be the same length as teh vector we work against
dat$result <- vector("character",length(dat$x))
# Loop on each value to remove the matching parts
for (i in 1:length(dat$x)) {
 dat$result[i] <- remove_matching_parts(as.character(dat$x[i]),matches[[i]])
}

If you have control over the pattern definition and can create it by hand then it can be achieved with a regex solution:

> gsub("(is )?my (other text|example)?","",dat$x)
[1] "this  text"        "and here  example" " is short" 

The idea is to create the pattern with optional parts (the ? after the grouping parentheses.

So we have roughly:

  • (is )? <= optional "is" followed by space
  • my <= literal "my" followed by space
  • (other text|example)? <= Optional text after "my ", either "other text" or (the |) "example"

If you don't have control, things gets messy, I hope I've commented enough for it to be understandable, according to the number of loops included don't expect it to be quick:

# Given datas
dat <- data.frame(x               = c("this is my example text", "and here is my other text example", "my other text is short","yet another text"),
                some_other_cols = c(1, 2, 2, 4))

my_patterns <- c("my example", "is my", "my other text")

# Create function to remove matching part of text
# First argument is text, second argument is a list of start and length 
remove_matching_parts <- function(text, positions) {
  ret <- strsplit(text,"")[[1]]
  lapply(positions, function(x) { ifelse(is.na(x),,ret[ x[1]:x[2] ] <<- NA ) } )
  paste0(ret[!is.na(ret)],separator="",collapse="")
}

# Create the matches between a vector and a pattern
# First argument is the pattern to match, second is the vector of charcaters
match_pat_to_vector <- function(pattern,vector) {
  sapply(regexec(pattern,vector), 
         function(x) {
           if(x>-1) { 
             c(start=as.numeric(x), end=as.numeric(x+attr(x,"match.length")) ) # Create a start/end vector from the index and length of the match
           }
         })
}

# Loop over the patterns to create a dataframe of matches
# row = length of vector, columns = length of pattern
matches <- sapply(my_patterns,match_pat_to_vector,vector=dat$x)

# Avoid growing a vector in a for loop, create it beforehand, it will be the same length as teh vector we work against
dat$result <- vector("character",length(dat$x))
# Loop on each value to remove the matching parts
for (i in 1:length(dat$x)) {
 dat$result[i] <- remove_matching_parts(as.character(dat$x[i]),matches[i,])
}

Result after run:

> dat
                                  x some_other_cols           result
1           this is my example text               1        this text
2 and here is my other text example               2 and here example
3            my other text is short               2         is short
4                  yet another text               4 yet another text

There are two crucial points here:

  1. The patterns to remove from a string may overlap
  2. There may be multiple non-overlapping patterns to remove from the string

The solution below tries to address both issues using my favorite tools

library(data.table)
setDT(dat)[, rn := .I] # add row numbers to join on later

library(stringr)
library(magrittr) # piping used to improve readability

pos <- 
  # find start and end positions for each pattern
  lapply(my_patterns, function(pat) str_locate_all(dat$x, pat) %>% 
           lapply(as.data.table) %>% 
           rbindlist(idcol = "rn")) %>% 
  rbindlist() %>% 
  # collapse overlapping positions
  setorder(rn, start, end) %>% 
  .[, grp := cumsum(cummax(shift(end, fill = 0)) < start), by = rn] %>% 
  .[, .(start = min(start), end = max(end)), by = .(rn, grp)]

Now, pos has become:

    rn grp start end
 1:  1   1     6  18
 2:  2   1    10  25
 3:  3   1     1  13
 4:  5   1     6  10
 5:  5   2    24  28
 6:  6   1     1  13
 7:  6   2    15  27
 8:  7   1     3   7
 9:  8   1     1  10
10:  8   2    12  16
11:  8   3    22  34
12:  9   1     1  10
13:  9   2    19  31
# remove patterns from strings from back to front
dat[, short_x := x]
for (g in rev(seq_len(max(pos$grp)))) {
  # update join 
  dat[pos[grp == g], on = .(rn), short_x := `str_sub<-`(short_x, start, end, value = "")]
}
dat[, rn := NULL][   #remove row number
  , short_x := str_squish(short_x)][]   # remove whitespace 
                                             x some_other_cols                          short_x
1:                     this is my example text               1                        this text
2:           and here is my other text example               2                 and here example
3:                      my other text is short               2                         is short
4:                            yet another text               4                 yet another text
5: this is my text where 'is my' appears twice               5 this text where '' appears twice
6:                 my other text is my example               6                                 
7:                                 This myself               7                           Thself
8:          my example is my not my other text               8                              not
9:             my example is not my other text               9                           is not

The code to collapse overlapping positions is modified from this answer.

The intermediate result

lapply(my_patterns, function(pat) str_locate_all(dat$x, pat) %>% 
           lapply(as.data.table) %>% 
           rbindlist(idcol = "rn"))
[[1]]
   rn start end
1:  1     9  18
2:  6    18  27
3:  8     1  10
4:  9     1  10

[[2]]
   rn start end
1:  1     6  10
2:  2    10  14
3:  5     6  10
4:  5    24  28
5:  6    15  19
6:  7     3   7
7:  8    12  16

[[3]]
   rn start end
1:  2    13  25
2:  3     1  13
3:  6     1  13
4:  8    22  34
5:  9    19  31

shows that patterns 1 and 2 overlap in row 1 and patterns 2 and 3 overlap in row 2. Rows 5, 8, and 9 have non-overlapping patterns. Row 7 is to show that patterns are extracted regardless of word boundaries.

EDIT: dplyr version

The OP has mentioned that he/she has "successfully avoided data.table so far". So, I felt challenged to add a dplyr version:

library(dplyr)
library(stringr)

pos <- 
  # find start end end positions for each pattern
  lapply(my_patterns, function(pat) str_locate_all(dat$x, pat) %>% 
           lapply(as_tibble) %>% 
           bind_rows(.id = "rn")) %>% 
  bind_rows() %>% 
  # collapse overlapping positions
  arrange(rn, start, end) %>% 
  group_by(rn) %>% 
  mutate(grp = cumsum(cummax(lag(end, default = 0)) < start)) %>% 
  group_by(rn, grp) %>% 
  summarize(start = min(start), end = max(end))
# remove patterns from strings from back to front
dat <- dat %>%
  mutate(rn = row_number() %>% as.character(),
         short_x = x %>% as.character())
for (g in rev(seq_len(max(pos$grp)))) {
  dat <- dat %>% 
    left_join(pos %>% filter(grp == g), by = "rn") %>% 
    mutate(short_x = ifelse(is.na(grp), short_x, `str_sub<-`(short_x, start, end, value = ""))) %>% 
    select(-grp, -start, -end)
}
# remove row number
dat %>% 
  select(-rn) %>% 
  mutate(short_x = str_squish(short_x))
                                            x some_other_cols                          short_x
1                     this is my example text               1                        this text
2           and here is my other text example               2                 and here example
3                      my other text is short               2                         is short
4                            yet another text               4                 yet another text
5 this is my text where 'is my' appears twice               5 this text where '' appears twice
6                 my other text is my example               6                                 
7                              This is myself               7                        This self
8          my example is my not my other text               8                              not
9             my example is not my other text               9                           is not

The algorithm is essentially the same. However, there are two challenges here where dplyr differs from data.table:

  • dplyr requires explicit coersion from factor to character
  • there is no update join available in dplyr, so the for loop has become more verbose than the data.table counterpart (Perhaps, someone knows a fancy purrr function or a map-reduce trick to accomplish the same?)

EDIT 2

There are some bug fixes and improvements to above codes:

  1. Collapsing positions has been corrected to work also for some edge case I have added to dat.
  2. seq() has been replaced by seq_len().
  3. str_squish() reduces repeated whitespace inside a string and removes whitespace from start and end of a string.

Data

I have added some use cases to test for non-overlapping patterns and complete removal, e.g.:

dat <- data.frame(
  x = c(
    "this is my example text",
    "and here is my other text example",
    "my other text is short",
    "yet another text",
    "this is my text where 'is my' appears twice",
    "my other text is my example",
    "This myself",
    "my example is my not my other text",
    "my example is not my other text"
  ),
  some_other_cols = c(1, 2, 2, 4, 5, 6, 7, 8, 9)
)
my_patterns <- c("my example", "is my", "my other text")

Tags:

Regex

R

Stringr