小编典典

正则表达式中的StackOverflowError

java

我正在使用正则表达式从多个文档中提取一些字符串。在使用某个特定的正则表达式时,我遇到了“StackOverflowError”问题;如果不使用该正则表达式,程序可以顺利执行。

我的代码:

 package com.gauge.ie.Annotator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.print.attribute.Size2DSyntax;

import org.apache.commons.io.FilenameUtils;
import org.apache.uima.util.FileUtils;


/**
 * Walks a directory tree, runs a series of regular expressions over the text of
 * every regular file to pull out bench/author/judge information, and appends the
 * collected rows to a CSV file.
 *
 * <p>Results are accumulated in the static lists below, so they are shared by
 * every instance of this class.
 */
public class RecursiveFileDisplay 
{

    /** Paths of files for which none of the patterns matched. */
    static List<String> misclist=new ArrayList<String>();
    /** Tab-separated rows (file path, bench/judge text, author text) written out by {@link #writeCSV()}. */
    static List<String> list=new ArrayList<String>();
    static LinkedHashMap<String,String> authormap=new LinkedHashMap<>();
    static List<String> random=new ArrayList<String>();
    static List<String> benchlist=new ArrayList<String>();
    static LinkedHashMap<String,String> benchmap=new LinkedHashMap<>();
    static List<String> misc1list=new ArrayList<String>();
    String csvfile="/home/gauge/Documents/Docs/madras.csv";

    /** Output writer, opened in append mode by the constructor; closed by {@link #main(String[])}. */
    FileWriter fw;

    /**
     * Opens the output CSV file in append mode.
     *
     * @throws IOException if the output file cannot be opened for writing
     */
    public RecursiveFileDisplay()throws IOException 
    {
        fw=new FileWriter("/home/gauge/Documents/Docs/supremecourt.csv",true);
    }

    /**
     * Scans the sample-data directory, prints the sizes of the result lists and
     * writes the collected rows to the CSV file.
     *
     * @param args unused
     * @throws Exception if the output file cannot be opened, written or closed
     */
    public static void main(String[] args) throws Exception
    {
        RecursiveFileDisplay rsd=new RecursiveFileDisplay();
        try
        {
            File currentDir = new File("/home/gauge/Documents/Docs/SampleData/SupremeCourt"); 
            rsd.readFilesFromDirectory(currentDir);
            System.out.println(benchlist.size());
            System.out.println(list.size());
            System.out.println(random.size());
            rsd.writeCSV();
        }
        finally
        {
            // The writer was previously never closed; release the file handle here.
            rsd.fw.close();
        }
    }

    /**
     * Appends every collected row to the output file, one row per line.
     *
     * @throws IOException if writing to the output file fails
     */
    public void writeCSV()throws IOException 
    {
        for(String row:list)
        {
            fw.append(row);
            fw.append("\n");
        }
        // One flush after the loop is enough; flushing per row only adds I/O calls.
        fw.flush();
        System.out.println("Csv file is done!");
    }

    /**
     * Recursively processes {@code dir}: each regular file is read fully into memory
     * and tested against the patterns in order; the first pattern that matches
     * decides what is recorded. Files that match nothing are copied, under a random
     * name, into the Misc directory. Sub-directories are processed recursively.
     *
     * <p>Any exception, and any {@link StackOverflowError} raised by the regex
     * engine, is printed and ends the processing of {@code dir}.
     *
     * @param dir directory to scan
     */
    public  void readFilesFromDirectory(File dir) 
    {
        try
        {
            Pattern p1=Pattern.compile("(Author):(.*)");
            Pattern p=Pattern.compile("(Bench:)(.*)");
            Pattern p2=Pattern.compile("JUDGMENT(.*?)J[.]");
            Pattern p3=Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");
            Pattern p5=Pattern.compile("\\s\\w*(?=\\w*[,]\\sJ[.]*\\b)");
            Pattern p4=Pattern.compile("\\w*(?=\\w*[,]*\\s*((JJ)|(L.J)|(C.J)|(J))[.]\\s\\b)");
            // NOTE: "(.|\\n)*" is matched recursively by java.util.regex and exhausts the
            // stack on long inputs; this is the source of the StackOverflowError caught
            // below. ".*" compiled with Pattern.DOTALL matches the same text without it.
            Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");

            File[] listfiles=dir.listFiles();
            if(listfiles==null)
            {
                // listFiles() returns null when dir is not a directory or cannot be read.
                System.err.println("Cannot list directory: "+dir);
                return;
            }
            for(File file:listfiles)
            {
                if(file.isFile())
                {
                    // Read the whole file, normalising every line ending to "\n".
                    StringBuilder content=new StringBuilder();
                    try(BufferedReader br=new BufferedReader(new FileReader(file)))
                    {
                        String line;
                        while((line=br.readLine())!=null)
                        {
                            content.append(line).append("\n");
                        }
                    }
                    String str=content.toString();

                    Matcher match=p.matcher(str);
                    Matcher match1=p1.matcher(str);
                    Matcher match2=p2.matcher(str);
                    Matcher match3=p3.matcher(str);
                    Matcher match4=p4.matcher(str);
                    Matcher match5=p5.matcher(str); 
                    Matcher match6=p6.matcher(str);

                    if(match.find())
                    {
                        if(match1.find())
                        {
                            // Row layout: file name, bench text, author text.
                            list.add(file.toString()+"\t"+match.group(2)+"\t"+match1.group(2));
                            System.out.println(match1.group(2));
                        }
                        else
                        {
                            list.add(file.toString()+"\t"+match.group(2)+"\t"+" ");
                            System.out.println(match.group(2));
                        }
                    }
                    else if(match1.find())
                    {
                        list.add(file.toString()+"\t"+" "+"\t"+match1.group(2));
                    }
                    else if(match2.find())
                    {
                        list.add(file.toString()+"\t"+match2.group()+"\t"+" ");
                    }
                    else if(match3.find())
                    {
                        list.add(file.toString()+"\t"+match3.group()+"\t"+" ");
                    }
                    else if(match4.find())
                    {
                        // Matched, but intentionally nothing is recorded for this pattern.
                    }
                    else if(match5.find())
                    {
                        list.add(file.toString()+"\t"+match5.group()+"\t"+" ");
                        System.out.println(file.toString());
                    }
                    else if(match6.find())
                    { 
                        System.out.println("lalalalal");
                    }
                    else
                    {
                        // No pattern matched: remember the file and keep a copy of its text.
                        misclist.add(file.toString());
                        String name = UUID.randomUUID().toString();
                        try(PrintWriter pw=new PrintWriter("/home/gauge/Documents/Docs/Misc"+"/"+name))
                        {
                            pw.write(str);
                        }
                    }
                }
                else if(file.isDirectory())
                {
                    readFilesFromDirectory(file.getAbsoluteFile());
                    System.out.println("recursion");
                }
            }   
        }   
        catch(StackOverflowError soe)
        {
            // Raised by the regex engine on long inputs (see the note on p6 above).
            soe.printStackTrace();
            System.err.print(soe);
        }
        catch (Exception e)
        {
            e.printStackTrace();
            System.err.print(e);
        }
    }

}

当我删除模式p6时,它没有显示任何错误。

stackTrace如下:

at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
    at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658)
    at java.util.regex.Pattern$Loop.match(Pattern.java:4785)
    at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717)
    at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568)
    at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777)
    at java.util.regex.Pattern$Branch.match(Pattern.java:4604)
java.lang.StackOverflowError

阅读 262

收藏
2020-11-26

共1个答案

小编典典

问题出在 p6 中的 (.|\\n)* 部分:

Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");

(.|\\n)* 在 Oracle / OpenJDK JRE 上会被编译为以下结构,其实现使用递归(请注意 GroupTail 的 next 又指回 Loop)来匹配非确定性模式的重复(在该实现中,交替(alternation)始终被视为非确定性)。

Prolog. Loop wrapper
Loop [732768bb]. Greedy quantifier {0,2147483647}
  GroupHead. (DEBUG) local=0
  Branch. Alternation (in printed order):
    Dot. (?:.), equivalent to [^\n\r\u0085\u2028\u2029]
    ---
    Single. Match code point: U+000A LINE FEED (LF)
    ---
  BranchConn [204d080d]. Connect branches to sequel.
  GroupTail [214b9e0c]. (DEBUG) local=0, group=2. --[next]--> Loop [732768bb]

对于较长的字符串,这种递归会耗尽调用栈,因此会得到 StackOverflowError。

如果要毫无例外地匹配任何字符,则应单独使用 . ,并配合 Pattern.DOTALL 标志。

  • 您可以将标志传递给Pattern.compile(String regex, int flags)方法以打开整个表达式的标志:

    Pattern p6 = Pattern.compile("(BENCH:)(.*)(BENCH)(.*)(CITATION)", Pattern.DOTALL);
    
  • 或按照Jonny 5的评论中的建议,也可以使用内联标志(?s)

    Pattern p6 = Pattern.compile("(?s)(BENCH:)(.*)(BENCH)(.*)(CITATION)");
    
  • 另外,您也可以打开子模式的标志(?s:.*)

    Pattern p6 = Pattern.compile("(BENCH:)(?s:(.*))(BENCH)(?s:(.*))(CITATION)");
    

顺便说一句,您确定要让 p3 匹配以 | 开头的文本(例如 |onorable)吗?字符类 [H|h] 会把 | 当作普通字符来匹配。

Pattern p3 = Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");

如果不需要,请把 | 从字符类中删除:

Pattern p3 = Pattern.compile("(([Hh]on)|(HON)).*((ble)|BLE)(.*)");

我还注意到捕获组数量过多。请逐一检查它们是否确实必要。

2020-11-26