19

I know by using Xeger, we can get a random value for a specified pattern.

String regex = "[0-9]{2}"; 
Xeger generator = new Xeger(regex);
String result = generator.generate();

I want to know is there a way to return all of the valid strings for the specified regex. For example, for pattern: [0-9]{2}, we can get all of the values from 00 to 99.

Thanks

Edit:

Here we don't consider the infinite outputs like + and *; how can we get all values for a finite regex?

Last edit:

Thanks everyone! Finally I don't consider all the possible values as there may be thousands. I limit a specific number as the number of values to reduce the amount.

Eve
  • 785
  • 3
  • 7
  • 15
  • 1
    +1 for question but for most regular expressions number of matching strings is unlimited. For example `[0-9]+` – AlexR Apr 11 '13 at 13:34
  • 1
    This can only work for regexes that only admit finite-length inputs. For example, the `*` and `+` operators are out. Presumably you're OK with this? – NPE Apr 11 '13 at 13:34
  • @NPE You don't have to generate infinitely many values to return a generator that constructs each possible result, throws it out, constructs the next, etc. Think python generators :) – Patashu Apr 11 '13 at 13:39
  • 2
    Even without that limitation, you'll run into trouble pretty darn quickly. Storing all possible strings for `\w{10}` (if `\w` is defined as only ASCII letters/digits/underscore) will require about 43 Petabytes of storage. Enjoy. – Tim Pietzcker Apr 11 '13 at 14:17
  • possible duplicate of [Enumerate Possible Matches of Regular Expression in Java](http://stackoverflow.com/questions/13688778/enumerate-possible-matches-of-regular-expression-in-java) – artbristol Apr 11 '13 at 16:23
  • Can you please upvote the useful answers, and accept the best one? I see that you still have to accept an answer in your other month-long answer... it's not required, but as you can see SO is very limited for low-reputation users, and not everyone of us has a >1000 rep score – berdario Apr 16 '13 at 17:49

4 Answers4

6

Since a regexp is defined by a finite state machine, I wondered if there was something out there able to automatically reason on such machines and that was a good fit to be repurposed for this work... and clojure.core.logic delivered

So, I looked at this definition of the regexp grammar (unfortunately, it lacks the {} quantifiers, but they should be pretty easy to add to my code) adapted it to the java escapes, and worked out this 110 lines long clojure program:

(ns regexp-unfolder.core
  (:require [instaparse.core :as insta])
  (:require [clojure.core.logic :as l])
  (:require [clojure.set :refer [union difference]])
  (:gen-class :methods [#^{:static true} [unfold [String] clojure.lang.LazySeq]])
)

(def parse-regexp (insta/parser 
             "re = union | simple-re?
             union = re '|' simple-re
             simple-re = concat | base-re
             concat = simple-re base-re
             base-re = elementary-re | star | plus
             star = elementary-re '*'
             plus = elementary-re '+'
             elementary-re = group | char | '$' | any | set
             any = '.'
             group = '(' re ')'
             set = positive-set | negative-set
             positive-set = '['  set-items ']'
             negative-set = '[^' set-items ']'
             set-items = set-item*
             set-item = range | char
             range = char '-' char
             char = #'[^\\\\\\-\\[\\]]|\\.'" ))

(def printables (set (map char (range 32 127))))

(declare fns handle-first)

(defn handle-tree [q qto [ type & nodes]]
  (if (nil? nodes)
    [[q [""] qto]]
    ((fns type handle-first) q qto nodes)))

(defn star [q qto node &]
  (cons [q [""] qto]
         (handle-tree q q (first node))))

(defn plus [q qto node &] 
  (concat (handle-tree q qto (first node))
          (handle-tree qto qto (first node))))

(defn any-char [q qto & _] [[q (vec printables) qto]] )

(defn char-range [[c1 _ c2]]
  (let [extract-char (comp int first seq second)]
    (set (map char (range (extract-char c1) (inc (extract-char c2)))))))

(defn items [nodes]
  (union (mapcat
    (fn [[_ [type & ns]]]
      (if (= type :char)
        #{(first ns)}        
        (char-range ns)))
    (rest (second nodes)))))

(defn handle-set [q qto node &] [[q (vec (items node)) qto]])

(defn handle-negset [q qto node &] [[q (vec (difference printables (items node))) qto]])

(defn handle-range [q qto & nodes] [[q (vec (char-range nodes)) qto]])

(defn handle-char [q qto node &] [[q (vec node) qto]] )

(defn handle-concat [q qto nodes] 
  (let [syms (for [x  (rest nodes)] (gensym q))]
    (mapcat handle-tree  (cons q syms) (concat syms [qto] ) nodes)
  ))

(defn handle-first [q qto [node & _]] (handle-tree q qto node))

(def fns {:concat handle-concat, :star star, :plus plus, :any any-char, :positive-set handle-set, :negative-set handle-negset, :char handle-char})

(l/defne transition-membero
  [state trans newstate otransition]
  ([_ _ _ [state trans-set newstate]]
     (l/membero trans trans-set)))

(defn transitiono [state trans newstate transitions]
  (l/conde
   [(l/fresh [f] 
             (l/firsto transitions f)
             (transition-membero state trans newstate f))]
   [(l/fresh [r]
             (l/resto transitions r)
             (transitiono state trans newstate r))])
  )

(declare transitions)

;; Recognize a regexp finite state machine encoded in triplets [state, transition, next-state], adapted from a snippet made by Peteris Erins

(defn recognizeo
  ([input]
     (recognizeo 'q0 input))
  ([q input]
     (l/matche [input] ; start pattern matching on the input
        (['("")]
           (l/== q 'ok)) ; accept the empty string if we are in an accepting state
        ([[i . nput]]
           (l/fresh [qto]
                  (transitiono q i qto transitions) ; assert it must be what we transition to qto from q with input symbol i
                  (recognizeo qto nput)))))) ; recognize the remainder


(defn -unfold [regex] 
  (def transitions 
    (handle-tree 'q0 'ok (parse-regexp regex)))
  (map (partial apply str) (l/run* [q] (recognizeo q))))

Being written with core.logic, it should be fairly easy to adapt it to work also as a regexp matcher

I limited the printables characters from 32 to 126 ascii, otherwise it'd be too cumbersome to deal with regexps such as [^c], but you can extend it quite easily... also, I haven't implemented yet unions, optional patterns, and the \w, \s, etc. escapes for character classes

This is the biggest thing I wrote in clojure up to now, but the basics seems to be covered just fine... some examples:

regexp-unfolder.core=> (-unfold "ba[rz]")
("bar" "baz")
regexp-unfolder.core=> (-unfold "[a-z3-7]")
("a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "3" "4" "5" "6" "7")
regexp-unfolder.core=> (-unfold "[a-z3-7][01]")
("a0" "a1" "b0" "b1" "c0" "c1" "d0" "d1" "e0" "e1" "f0" "f1" "g0" "g1" "h0" "h1" "i0" "i1" "j0" "j1" "k0" "k1" "l0" "l1" "m0" "m1" "n0" "n1" "o0" "o1" "p0" "p1" "q0" "q1" "r0" "r1" "s0" "s1" "t0" "t1" "u0" "u1" "v0" "v1" "w0" "w1" "x0" "x1" "y0" "y1" "z0" "z1" "30" "31" "40" "41" "50" "51" "60" "70" "61" "71")
regexp-unfolder.core=> (-unfold "[^A-z]")
(" " "@" "!" "\"" "#" "$" "%" "&" "'" "(" ")" "*" "+" "," "-" "." "/" "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" ":" ";" "{" "<" "|" "=" "}" ">" "~" "?")
regexp-unfolder.core=> (take 20 (-unfold "[abc]*"))
("" "a" "b" "c" "aa" "ab" "ac" "ba" "ca" "aaa" "bb" "cb" "aab" "bc" "cc" "aac" "aba" "aca" "baa" "caa")
regexp-unfolder.core=> (take 20 (-unfold "a+b+"))
("ab" "aab" "abb" "abbb" "aaab" "abbbb" "aabb" "abbbbb" "abbbbbb" "aabbb" "abbbbbbb" "abbbbbbbb" "aaaab" "aabbbb" "aaabb" "abbbbbbbbb" "abbbbbbbbbb" "aabbbbb" "abbbbbbbbbbb" "abbbbbbbbbbbb")

Since I started this way, I implemented also infinite outputs :)

If someone is interested, I uploaded it here

and obviously, here's an example of how to invoke unfold from plain old Java:

import static regexp_unfolder.core.unfold;

public class UnfolderExample{
    public static void main(String[] args){
        @SuppressWarnings("unchecked")
        Iterable<String> strings = unfold("a+b+");
        for (String s : strings){
            System.out.println(s);
        }
    }
}
berdario
  • 1,851
  • 18
  • 29
4

Here is in C language written open-source generator RegLdg - regular expression grammar language dictionary generator.

I believe, it will be not very difficult to make Java port of this program.

Andremoniy
  • 34,031
  • 20
  • 135
  • 241
  • I downloaded the RegLdg but it gives me error while I hit the make all command: collect2: error: ld returned 1 exit status Makefile:21: recipe for target 'all' failed make: *** [all] Error 1 – nzy Oct 26 '17 at 21:22
2

Finding all matches is very similar to finding a random match. Below is a simple modification of the logic that generates random matches on www.debuggex.com, assuming you already have a parse tree.

The idea is that for every subtree, you return a list of all possible generated strings, given a string that was generated by all previous nodes in your parse tree.

AltTree.all = (prefix) ->
    rets = []
    for child in children
        rets.extend(child.all(prefix))

ConcatTree.all = (prefix) ->
    prefixes = [prefix]
    for child in children
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(child.all(p))
        prefixes = newPrefixes
    return prefixes

RepeatTree.all = (prefix) ->
    prefixes = [prefix]
    rets = []
    for i up to max
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(onlyChild.all(p))
        prefixes = newPrefixes
        if i >= min
            rets.extend(prefixes)
    return rets

CharsetTree.all = (prefix) ->
    rets = []
    for char in allValidChars():
        rets.push(prefix + char)
    return rets

The rest of the trees are left as exercises (most notably the literal tree).

Note that there are intentionally no optimizations for the sake of clarity. Calling myTree.all('') will generate a list such that every valid matching string appears once for every path that generates this string. You will probably want to add deduplication and get rid of the excessive copying.

I should also add that this will only work for regular expressions that have a small number of total matching strings. This is because all of the strings are being stored. If you want to get around this limitation, you can yieldify this algorithm. You will need to maintain a stack (think of it as being a bread crumb trail) of where you are in the tree. When a new string is asked for, you will create it from the path you travelled, and then update the path.

Sergiu Toarca
  • 2,697
  • 20
  • 24
0

A trivial implementation of such algorithm is simply:

def generate_matching(pattern):
    alphabets = [...]
    l = 1
    while True:
        # generate all Cartesian product of the alphabets of length `l`
        for s in itertools.product(alphabets, repeat=l):
            s = "".join(s)
            if pattern.match(s):
                print s
        l += 1
Lie Ryan
  • 62,238
  • 13
  • 100
  • 144
  • 3
    Isn't the hard part of this generating the `alphabets` in such a way that you're not throwing away over 99.9% of the work you're doing? Doesn't this basically just generate every possible string and compare it to the regex? – Ian McLaird Apr 11 '13 at 14:47