2

I am extracting some strings from several documents using regular expressions. I am stuck at this "StackOverflowError" which comes for a specific regular expression.Without using that regex, the program executes smoothly.

My Code:

 package com.gauge.ie.Annotator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.print.attribute.Size2DSyntax;

import org.apache.commons.io.FilenameUtils;
import org.apache.uima.util.FileUtils;


public class RecursiveFileDisplay 
{

    static List<String> misclist=new ArrayList<String>();
    static List<String> list=new ArrayList<String>();
    static LinkedHashMap<String,String> authormap=new LinkedHashMap<>();
    static List<String> random=new ArrayList<String>();
    static List<String> benchlist=new ArrayList<String>();
    static LinkedHashMap<String,String> benchmap=new LinkedHashMap<>();
    static List<String> misc1list=new ArrayList<String>();
    String csvfile="/home/gauge/Documents/Docs/madras.csv";



    FileWriter fw;

    public RecursiveFileDisplay()throws IOException 
    {
        fw=new FileWriter("/home/gauge/Documents/Docs/supremecourt.csv",true);
        // TODO Auto-generated constructor stub
    }

    public static void main(String[] args) throws Exception
    {
        RecursiveFileDisplay rsd=new RecursiveFileDisplay();
        File currentDir = new File("/home/gauge/Documents/Docs/SampleData/SupremeCourt"); 
        rsd.readFilesFromDirectory(currentDir);
        System.out.println(benchlist.size());
        System.out.println(list.size());
        System.out.println(random.size());
        rsd.writeCSV();
    }
    public void writeCSV()throws IOException 
    {

        for(String str:list)
        {
            fw.append(str);
            fw.append("\n");
            fw.flush();
        }
        System.out.println("Csv file is done!");

    }
    public  void readFilesFromDirectory(File dir) 
    {
        try
        {
            int i=0;
            Pattern p1=Pattern.compile("(Author):(.*)");
            Pattern p=Pattern.compile("(Bench:)(.*)");
            Pattern p2=Pattern.compile("JUDGMENT(.*?)J[.]");
            Pattern p3=Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");
            //Pattern p4=Pattern.compile(",\\s*([^,]+),[^,]*\\b(J|JJ)\\.");//\s\w*(?=\w*[,]\sJ[.]*\b)
            Pattern p5=Pattern.compile("\\s\\w*(?=\\w*[,]\\sJ[.]*\\b)");
            Pattern p4=Pattern.compile("\\w*(?=\\w*[,]*\\s*((JJ)|(L.J)|(C.J)|(J))[.]\\s\\b)");
            Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");
            File[] listfiles=dir.listFiles();
            for(File file:listfiles)
            {
                if(file.isFile())
                {
                String str="";
                String line="";
                BufferedReader br=new BufferedReader(new FileReader(file));
                while((line=br.readLine())!=null)
                {
                    str+=line+"\n";
                }
                Matcher match=p.matcher(str);
                Matcher match1=p1.matcher(str);
                Matcher match2=p2.matcher(str);
                Matcher match3=p3.matcher(str);
                Matcher match4=p4.matcher(str);
                Matcher match5=p5.matcher(str); 
                Matcher match6=p6.matcher(str);

                 if(match.find())
                 {
                    if(match1.find())
                    {
                        list.add(file.toString()+"\t"+match.group(2)+"\t"+match1.group(2));         //filename,   judgename    ,authorname
                        System.out.println(match1.group(2));
                    }
                    else
                    {
                        list.add(file.toString()+"\t"+match.group(2)+"\t"+" ");
                        System.out.println(match.group(2));
                    }
                 }
                 else if(match1.find())
                 {
                        list.add(file.toString()+"\t"+" "+"\t"+match1.group(2));
                 }
                 else if(match2.find())
                 {
                     list.add(file.toString()+"\t"+match2.group()+"\t"+" ");
                 }
                 else if(match3.find())
                 {
                     list.add(file.toString()+"\t"+match3.group()+"\t"+" ");
                 }
                 else if(match4.find())
                 {
                    //do nothing
                 }
                 else if(match5.find())
                 {
                     list.add(file.toString()+"\t"+match5.group()+"\t"+" ");
                     System.out.println(file.toString());
                 }
                 else if(match6.find())
                 { 
                     System.out.println("lalalalal");
                 }
                 else
                 {
                        misclist.add(file.toString());                          //list of documents which have no Judgenames
                        String name = UUID.randomUUID().toString();
                        PrintWriter pw=new PrintWriter("/home/gauge/Documents/Docs/Misc"+"/"+name);
                        pw.write(str);
                        pw.flush();
                 }

                }
                else if(file.isDirectory())
                {
                    readFilesFromDirectory(file.getAbsoluteFile());
                    System.out.println("recursion");
                }
            }   
        }   
        catch(StackOverflowError soe)
        {
            soe.printStackTrace();
            System.err.print(soe);
        }
        catch (Exception e)
        {
            e.printStackTrace();
            System.err.print(e);
        }
    }

}

when i remove pattern p6, it doesn't show any error.

The stackTrace is as follows:

at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
java.lang.StackOverflowError
Narendra Rawat
  • 353
  • 2
  • 5
  • 17
  • Did you try with DOTALL for p6? `(?s)(BENCH:)(.*)(BENCH)(.*)(CITATION)` This looks shonky: `(.|\\n)*` why use? – Jonny 5 Jul 28 '15 at 12:47
  • Could improve performance by making the first quantifier lazy - if this would not affect your result: `(?s)(BENCH:)(.*?)(BENCH)(.*)(CITATION)` – Jonny 5 Jul 28 '15 at 12:55
  • I had tried what you have suggested, but it doesn't work for java. – Narendra Rawat Jul 28 '15 at 17:48
  • @AvinashRaj: The answers on the other question doesn't give a solution, so I have reopened the question. – nhahtdh Jul 29 '15 at 03:37

2 Answers2

8

The problem comes from (.|\\n)* part of p6:

Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");

(.|\\n)* compiles into the following structure on Oracle/OpenJDK JRE, whose implementation uses recursion (note GroupTail goes back to Loop) to match repetition of non-deterministic pattern (alternation is always considered non-deterministic in the implementation).

Prolog. Loop wrapper
Loop [732768bb]. Greedy quantifier {0,2147483647}
  GroupHead. (DEBUG) local=0
  Branch. Alternation (in printed order):
    Dot. (?:.), equivalent to [^\n\r\u0085\u2028\u2029]
    ---
    Single. Match code point: U+000A LINE FEED (LF)
    ---
  BranchConn [204d080d]. Connect branches to sequel.
  GroupTail [214b9e0c]. (DEBUG) local=0, group=2. --[next]--> Loop [732768bb]

On a long string, the stack runs out, so you get StackOverflowError.

If you want to match any character without exception, then you should use . alone in combination with Pattern.DOTALL flag.

  • You can pass the flag to Pattern.compile(String regex, int flags) method to turn on the flag for the whole expression:

    Pattern p6 = Pattern.compile("(BENCH:)(.*)(BENCH)(.*)(CITATION)", Pattern.DOTALL);
    
  • Or as suggested in the comment by Jonny 5, you can also use the inline flag (?s):

    Pattern p6 = Pattern.compile("(?s)(BENCH:)(.*)(BENCH)(.*)(CITATION)");
    
  • Alternatively, you can also turn on the flag for a sub-pattern (?s:.*):

    Pattern p6 = Pattern.compile("(BENCH:)(?s:(.*))(BENCH)(?s:(.*))(CITATION)");
    

By the way, are you sure you want to match |onrable in p3?

Pattern p3 = Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");

If you don't want it, remove | from the character class:

Pattern p3 = Pattern.compile("(([Hh]on)|(HON)).*((ble)|BLE)(.*)");

I also see an excessive amount of capturing group. Please review and check whether they are really necessary.

nhahtdh
  • 55,989
  • 15
  • 126
  • 162
2

The branching in (.|\n)* is adding to the stack with each character captured. Your string to match is long enough that your stack overflows.

One option would be to just change this to .* and use the option DOTALL. Another is to take a long hard look at what you're trying to capture, from what, why, and then use different regular expressions to the same effect or build your own simple state machine to scan through a stream of characters.

It looks like you're either greping recursively in a directory (maybe use grep) or trying to parse out something (maybe build a parser, like with javacc).

Use a StringBuilder to concatenate a lot of strings. Better yet use new String(Files.readAllBytes(path), charset) to avoid the hassle and garbage collection of each line as its own string.

Remove any grouping that you don't actually need for branching or capture purposes. Any branching group that need not be captured should begin with ?:. You might like to use named groups for the rest. Test out your regular expressions in a tool first, maybe even one that can write a snippet for you.

Consider the meaning of | when inside a square bracketed character class, and the meaning of similarly making a bracketed class of only one character like . or ,

Pattern.compile is a static method, it need only be used once per pattern, rather than with each recursion, by say, pulling those lines out of the method and into the class with [I've edited p6]:

private static Pattern p = Pattern.compile("Bench:(.*)");
private static Pattern p1 = Pattern.compile("Author:(.*)");
private static Pattern p2 = Pattern.compile("JUDGMENT(.*?)J\\.");
private static Pattern p3 = Pattern.compile("[Hh](on|ON).*(ble|BLE)(.*)");
private static Pattern p4 = Pattern.compile(",\\s*([^,]+),[^,]*\\b(J|JJ)\\.");
private static Pattern p5 = Pattern.compile("\\s\\w*(?=\\w*,\\sJ\\.*\\b)"); //? [.]* ?
private static Pattern p4 =
    Pattern.compile("\\w*(?=\\w*,*\\s*(JJ|L.J|C.J|J).\\s\\b)"); //? [,]* ?
private static Pattern p6 =
    Pattern.compile("BENCH:.*?BENCH.*?CITATION", Pattern.DOTALL);

You'd see stuff like "path\dir"+"\"+name and "\t"+" " a lot more clearly if you first got into a habit of writing: "path\dir" + "\" + name and "\t" + " " and then appropriately combined parts to "path\dir\" + name and "\t ".

Lastly, I'd scope the matchers and change the bracing format, that's probably just me:

package com.you.take.me.to.funky;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.path.Files;
import java.nio.path.Path;
import java.nio.path.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class town {

  private static List<String> list = new ArrayList<>();
  private static List<String> misclist = new ArrayList<>();

  private static Pattern p0 = Pattern.compile("(Bench:)(.*)");
  private static Pattern p1 = Pattern.compile("(Author):(.*)");
  private static Pattern p2 = Pattern.compile("JUDGMENT(.*?)J[.]");
  private static Pattern p3 = Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");
  private static Pattern p4 = Pattern.compile(",\\s*([^,]+),[^,]*\\b(J|JJ)\\.");//CAKE
  private static Pattern p5 = Pattern.compile("\\s\\w*(?=\\w*[,]\\sJ[.]*\\b)");
  private static Pattern p4 =
      Pattern.compile("\\w*(?=\\w*[,]*\\s*((JJ)|(L.J)|(C.J)|(J))[.]\\s\\b)");
  private static Pattern p6 =
      Pattern.compile("BENCH:.*?BENCH.*?CITATION", Pattern.DOTALL);

  public static void main(String[] args) {
    Path path = Paths.get(args[0]);
    String str;
    try {
      str = new String(Files.readAllBytes(path), StandardCharsets.UTF_8);
    } catch (IOException e) {
      e.printStackTrace();
    }

    Matcher match = p0.matcher(str);
    if (match.find()) {
      Matcher match1 = p1.matcher(str);
      if (match1.find()) {
        // pathname  judgename  authorname
        list.add(path.toString() + 
                 "\t" + match.group(2) + 
                 "\t" + match1.group(2));
        System.out.println(match1.group(2));
      } else {
        list.add(path.toString() + "\t" + match.group(2) + "\t ");
        System.out.println(match.group(2));
      }
    } else {
      match = p1.matcher(str);
      if (match.find()) {
        list.add(path.toString() + "\t \t" + match.group(2));
      } else {
        match = p2.matcher(str);
        if (match.find()) {
          list.add(path.toString() + "\t" + match.group() + "\t ");
        } else {
          match = p3.matcher(str);
          if (match.find()) {
            list.add(path.toString() + "\t" + match.group() + "\t ");
          } else {
            match = p4.matcher(str);
            if (match.find()) {
              //do nothing
            } else {
              match = p5.matcher(str);
              if (match.find()) {
                list.add(path.toString() + "\t" + match.group() + "\t ");
                System.out.println(path.toString());
              } else {
                match = p6.matcher(str);
                if (match.find()) {
                  System.out.println("DEBUG MARKER");
                } else {
                  // list of documents which have no Judgenames
                  misclist.add(path.toString());
                  String name = UUID.randomUUID().toString();
                  try {
                    PrintWriter pw = new PrintWriter("/h/g/d/d/m/" + name);
                    pw.write(str);
                    pw.flush();
                  } catch (FileNotFoundException e) {
                    e.printStackTrace();
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
dlamblin
  • 43,965
  • 20
  • 101
  • 140