Finding specific strings in an array using R

Question

I am trying to find if there is a quick way to search for specific strings in arrays in R, kind of like the game Boggle, except you know the word upfront.

You are allowed to move in the following directions for the next letter of the string: up, down, right or left

Say for a simple example you have an array of the form:

> G    
A, Q, A, Q, Q,  
A, Q, P, Q, Q,   
Q, Q, P, L, Q,   
Q, Q, Q, E, Q

And you want to apply a function to G with the string APPLE, for the function to return TRUE, APPLE exists in this array, and FALSE if it doesn't.

Does there exist a pre-made function or package that can do this, or alternatively is there a smart way to do it, I'm relatively new to dealing with strings in R and I'm struggling to see a way.

Any help much appreciated. Thanks.

Welcome to StackOverflow. Please take a look at these tips on how to produce a [minimum, complete, and verifiable example](http://stackoverflow.com/help/mcve), as well as this post on [creating a great example in R](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). Perhaps the following tips on [asking a good question](http://stackoverflow.com/help/how-to-ask) may also be worth a read. — lmo, Jan 06 '17 at 14:44

manotheshark · Answer 1 · 2017-01-06T15:26:35.193

this will first check if there are any characters in your word that do not exist within the array and then will check if the number of characters in the array are sufficient to meet repeat letters in your word

word <- strsplit("APPLE", "")
pool <- c("A", "Q", "A", "Q",
          "Q", "A", "Q", "P",
          "Q", "Q", "Q", "Q",
          "P", "L", "Q", "Q",
          "Q", "Q", "E", "Q")

t.word <- table(word)
t.pool <- table(pool)

length(setdiff(names(t.word), names(t.pool))) == 0
min(t.pool[names(t.word)] - t.word) >= 0

the last two functions will both output TRUE to show that all the letters from word exist in pool and that the count of a single letter in word is not greater than that of pool

in function form that will output TRUE if found, otherwise FALSE

word.find <- function(word, pool) {
  t.word <- table(strsplit(word, ""))
  t.pool <- table(pool)
  length(setdiff(names(t.word), names(t.pool))) == 0 & min(t.pool[names(t.word)] - t.word) >= 0
}

word.find("APPLE", pool)
[1] TRUE

word.find("APPLES", pool)
[1] FALSE

word.find("APPLEE", pool)
[1] FALSE

d.b · Answer 2 · 2017-01-20T20:59:49.997

This function works using only base R

THE FUNCTION

search_string = function(matrix_array, word_to_search){

    position = data.frame(NA,NA,NA) #Create empty dataframe

    word_to_search_inv = sapply(lapply(strsplit(word_to_search, NULL), rev), paste, collapse="") #Reverse word_to_search

    for (i in 1:nrow(matrix_array)){
        str_row = paste((matrix_array[i,]),collapse = "") #Collapse entire row into a string
        if (grepl(word_to_search,str_row)) { #Check if the word_to_search is in the string towards right
            position = rbind(position,c(i,paste(gregexpr(word_to_search, str_row)[[1]], collapse = ', '),"RIGHT")) #Get position and add it to the dataframe      
        }
        if (grepl(word_to_search_inv,str_row)) {#Check if the word_to_search is in the string towards left (by checking for reverse of word_to_search)
            position = rbind(position,c(i,paste(gregexpr(word_to_search_inv, str_row)[[1]], collapse = ', '),"LEFT"))       
        }
    }

    for (j in 1:ncol(matrix_array)){        
        str_column = paste((matrix_array[,j]),collapse = "")
        if (grepl(word_to_search, str_column)) { #Check if the word_to_search is in the string towards down
            position = rbind(position, c(paste(gregexpr(word_to_search, str_column)[[1]], collapse = ', '),j,"DOWN"))
        }
        if (grepl(word_to_search_inv, str_column)) { #Check if the word_to_search is in the string towards up
            position = rbind(position, c(paste(gregexpr(word_to_search_inv, str_column)[[1]], collapse = ', '),j,"UP"))
        }
    }

    colnames(position) = c("ROW","COLUMN","DIRECTION")
    position = position[c(2:nrow(position)),]
    rownames(position) = NULL
    return(position) #Return the datafram containing row, columnm, and direction where word_to_match is found
}

USAGE

#Data
mydata = structure(c("A", "A", "Q", "Q", "D", "Q", "Q", "Q", "Q", "B", 
                     "A", "P", "P", "L", "E", "Q", "Q", "L", "E", "S", "Q", "Q", "Q", 
                     "Q", "T", "A", "P", "P", "L", "E"), .Dim = c(5L, 6L), .Dimnames = list(NULL, c("V1", "V2", 
                                                                           "V3", "V4", "V5", "V6")))

key = "APPLE"

#Run the function
pos = search_string(mydata,key)

Thanks both. This works if the word is in a straight line, but doesn't if the word "moves round corners", do you know of way to do this? — user2915209, Jan 07 '17 at 18:49

score 1 · Answer 3 · answered Jan 07 '17 at 19:39

Adding another approach, having:

board = structure(c("A", "A", "Q", "Q", "Q", "Q", "Q", "Q", "A", "P", 
"P", "Q", "Q", "Q", "L", "E", "Q", "Q", "Q", "Q"), .Dim = 4:5, .Dimnames = list(
    NULL, NULL))

word = "APPLE"

we start with:

matches = lapply(strsplit(word, NULL)[[1]], function(x) which(x == board, arr.ind = TRUE))

which is a simple -probably unavoidable- search of indices of "board" that match each letter of the word. It's a "list" containing the row/col indices like:

#[[1]]
#     row col
#[1,]   1   1
#[2,]   2   1
#[3,]   1   3
#
#[[2]]
#     row col
#[1,]   2   3
#[2,]   3   3
#
##.....

Having that, we need to find out, progressively, whether an index in each element has a neighbour (i.e. the right/left/up/down cell) in the next element. E.g. we need something like:

as.matrix(find_neighbours(matches[[1]], matches[[2]], dim(board)))
#      [,1]  [,2]
#[1,] FALSE FALSE
#[2,] FALSE FALSE
#[3,]  TRUE FALSE

which informs us, that the row 3 of matches[[1]] is a neighbour of row 1 of matches[[2]], i.e. [1, 3] and [2, 3] are, indeed, neighbouring cells. We need this for each successive element in "matches":

are_neighs = Map(function(x, y) which(find_neighbours(x, y, dim(board)), TRUE), 
                 matches[-length(matches)], matches[-1])
are_neighs
#[[1]]
#     [,1] [,2]
#[1,]    3    1
#
#[[2]]
#     [,1] [,2]
#[1,]    2    1
#[2,]    1    2
#
#[[3]]
#     [,1] [,2]
#[1,]    2    1
#
#[[4]]
#     [,1] [,2]
#[1,]    1    1

Now that we have the pairwise ("i" with "i + 1") neighbour matches we need to complete the chain. For this example we'd like to have a vector like c(1, 2, 1, 1) which contains the info that the row 1 of are_neighs[[1]] is chained with the row 2 of are_neighs[[2]] which is chained with row 1 of are_neighs[[3]] which is chained with row 1 of are_neighs[[4]]. This smells like an "igraph" problem, but I'm not so familiar with it (hopefully someone has a better idea), so here's a naive approach to get that chaining:

row_connections = matrix(NA_integer_, nrow(are_neighs[[1]]), length(are_neighs))
row_connections[, 1] = 1:nrow(are_neighs[[1]])
cur = are_neighs[[1]][, 2]
for(i in 1:(length(are_neighs) - 1)) {
    im = match(cur, are_neighs[[i + 1]][, 1]) 
cur = are_neighs[[i + 1]][, 2][im]
row_connections[, i + 1] = im
}
row_connections = row_connections[complete.cases(row_connections), , drop = FALSE]

Which returns:

row_connections
#     [,1] [,2] [,3] [,4]
#[1,]    1    2    1    1

Having this vector, now, we can extract the respective chain from "are_neighs":

Map(function(x, i) x[i, ], are_neighs, row_connections[1, ])
#[[1]]
#[1] 3 1
#
#[[2]]
#[1] 1 2
#
#[[3]]
#[1] 2 1
#
#[[4]]
#[1] 1 1

which can be used to extract the appropriate row/col chain of indices from "matches":

ans = vector("list", nrow(row_connections))
for(i in 1:nrow(row_connections)) {
     connect = Map(function(x, i) x[i, ], are_neighs, row_connections[i, ])
     ans[[i]] = do.call(rbind, Map(function(x, i) x[i, ], matches, c(connect[[1]][1], sapply(connect, "[", 2))))
}
ans
#[[1]]
#     row col
#[1,]   1   3
#[2,]   2   3
#[3,]   3   3
#[4,]   3   4
#[5,]   4   4

Wrapping it all in a function (find_neighbours is defined inside):

library(Matrix)
ff = function(word, board)
{
    matches = lapply(strsplit(word, NULL)[[1]], function(x) which(x == board, arr.ind = TRUE))

    find_neighbours = function(x, y, d)
    {
        neighbours = function(i, j, d = d) 
        {
            ij = rbind(cbind(i, j + c(-1L, 1L)), cbind(i + c(-1L, 1L), j))
            ijr = ij[, 1]; ijc = ij[, 2]
            ij = ij[((ijr > 0L) & (ijr <= d[1])) & ((ijc > 0L) & (ijc <= d[2])), ]

            ij[, 1] + (ij[, 2] - 1L) * d[1]
        }

        x.neighs = lapply(1:nrow(x), function(i) neighbours(x[i, 1], x[i, 2], dim(board)))
        y = y[, 1] + (y[, 2] - 1L) * d[1]

        x.sparse = sparseMatrix(i = unlist(x.neighs), 
                                j = rep(seq_along(x.neighs), lengths(x.neighs)), 
                                x = 1L, dims = c(prod(d), length(x.neighs)))
        y.sparse = sparseMatrix(i = y, j = seq_along(y), x = 1L, dims = c(prod(d), length(y)))                         

        ans = crossprod(x.sparse, y.sparse, boolArith = TRUE)

        ans
    }      

    are_neighs = Map(function(x, y) which(find_neighbours(x, y, dim(board)), TRUE), matches[-length(matches)], matches[-1])

    row_connections = matrix(NA_integer_, nrow(are_neighs[[1]]), length(are_neighs))
    row_connections[, 1] = 1:nrow(are_neighs[[1]])
    cur = are_neighs[[1]][, 2]
    for(i in 1:(length(are_neighs) - 1)) {
        im = match(cur, are_neighs[[i + 1]][, 1]) 
        cur = are_neighs[[i + 1]][, 2][im]
        row_connections[, i + 1] = im
    }
    row_connections = row_connections[complete.cases(row_connections), , drop = FALSE]

    ans = vector("list", nrow(row_connections))
    for(i in 1:nrow(row_connections)) {
        connect = Map(function(x, i) x[i, ], are_neighs, row_connections[i, ])
        ans[[i]] = do.call(rbind, Map(function(x, i) x[i, ], matches, c(connect[[1]][1], sapply(connect, "[", 2))))
    }
    ans
}

We can try it:

ff("APPLE", board)
#[[1]]
#     row col
#[1,]   1   3
#[2,]   2   3
#[3,]   3   3
#[4,]   3   4
#[5,]   4   4

And with more than one matches:

ff("AQQP", board)
#[[1]]
#     row col
#[1,]   1   1
#[2,]   1   2
#[3,]   2   2
#[4,]   2   3
#
#[[2]]
#     row col
#[1,]   1   3
#[2,]   1   2
#[3,]   2   2
#[4,]   2   3
#
#[[3]]
#     row col
#[1,]   1   3
#[2,]   1   4
#[3,]   2   4
#[4,]   2   3

Although, it's flexible in returning multiple matches, it does not return all possible matches and, in a nutshell, that's because of the use of match when building the chain of neighbours -- a linear search could be used instead, but -at the moment- adds significant code complexity.

score 0 · Answer 4 · answered Jan 20 '17 at 20:06

I've written the below, and it works well and quickly, as well as being translatable to any other languages.

Given a graph G, and a dictionary, it searches through the dictionary and then tests whether G has any letters that correspond to the 1st letter of each word it needs to check. Next, it checks whether any of the neighbours, found by the indices of TRUE values + delta, of the TRUE values of the previous are equal the 2nd of the word. And this continues.

If at any point this is found not to be TRUE, the function ends and returns FALSE. Also, if you sort your dictionary by "rareness" of letter combinations, the function will work much quicker.

#function to check if a word appears in a graph
dict_check <- function(dictionary, G) {

#Run thru dictionary and check if word is G
#If at any point after a word check, it doesn't appear, break and return FALSE

n <- length(dictionary)
count_1 <- 0    #sum of words checked
count_2 <- 0    #sum of words successfully found
delta <- matrix(c(-1,  0, 1, 0, 
                   0, -1, 0, 1), 
                   byrow = T, nrow = 4, ncol = 2)

for (dc in 1:n) {
word <- dictionary[dc]

#Add 1 for each word checked
count_1 <- count_1 + 1

#Split word into a vector
W <- unlist(strsplit(word, ""))

#Boolean matrix for 1st letter of word, if not there, end and return False
G_bool <- G == W[1]
if(sum(G_bool) == 0) {
  return(FALSE)
}

#Fetch indices of True values for 1st letter of word
I <- which(G_bool == T, arr.ind = T)

#Loop thru word and check if neighbours match next letter of word,
#for all letters of word
#if at any point after iteration of a letter in word whereby G is all False,
#return False for word_check

last <- length(W)
for (w in 2:last) {

  #For each index in I, check if wordbox range, 
  #and check if neighbours ar equal to W[2, ...]
  for (i in 1:nrow(I)) {
    for (d in 1:nrow(delta)) {
      #neighbour
      k <- I[i, ] + delta[d, ]

      #If neighbour is out of bounds of box then move onto next neighbour
      #Each valid neighbour checked if is equal to next letter of word
      #If it is equal set to neighbour to TRUE, and original position to FALSE
      #If neighbour doesn't equal next letter, make original position FALSE anyway
      G_bool[I[i, 1], I[i, 2]] <- FALSE   #Set original position to FALSE
      if (k[1] == 0 | k[1] > nrow(G) | k[2] == 0 | k[2] > ncol(G)) {
        next} else if (G[k[1], k[2]] == W[w]) {
          G_bool[k[1], k[2]] <- TRUE    #Set neighbour to TRUE
        } 
      }
    }
    #Check after each iteration of letter if any letters of subsequent 
    #letters appear, if yes, continue to next letter of word, if no, return
    #FALSE for word check
    if (sum(G_bool) == 0) {
      return(FALSE)
    }
    #Update indices I for next TRUE in G_bool, corresponding to next letters found
    I <- which(G_bool == T, arr.ind = T)
  }
  #Final check after word iteration is complete on G_bool
  if (sum(G_bool) == 0) {
    return(FALSE)
  } else if (sum(G_bool) > 0) {
    count_2 <- count_2 + 1    #Add 1 to count_2 if word successfully found
  }
  if (count_1 != count_2) {
    return(FALSE)
  } 
  }
  #Final check
  if (count_1 != count_2) {
  return(FALSE)
  } else
  return(TRUE)
  }

Finding specific strings in an array using R

4 Answers4