我正在使用正则表达式从多个文档中提取一些字符串。在使用某个特定的正则表达式时,我遇到了“StackOverflowError”问题。如果不使用该正则表达式,程序可以顺利执行。
我的代码:
package com.gauge.ie.Annotator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.print.attribute.Size2DSyntax;
import org.apache.commons.io.FilenameUtils;
import org.apache.uima.util.FileUtils;

/**
 * Walks a directory tree, loads every regular file into a single string and
 * tries a fixed sequence of regular expressions against it.
 *
 * <p>Rows built from the first pattern that matches are collected in
 * {@link #list} and later appended to a CSV file by {@link #writeCSV()}.
 * Files that match none of the patterns are recorded in {@link #misclist}
 * and their content is copied to a randomly named file in a "Misc" directory.
 */
public class RecursiveFileDisplay {
    // Paths of files for which none of the patterns matched.
    static List<String> misclist=new ArrayList<String>();
    // Tab-separated output rows (file path, first value, second value).
    static List<String> list=new ArrayList<String>();
    // Declared but not used anywhere in this class.
    static LinkedHashMap<String,String> authormap=new LinkedHashMap<>();
    // Only its size is printed in main(); nothing in this class adds to it.
    static List<String> random=new ArrayList<String>();
    // Only its size is printed in main(); nothing in this class adds to it.
    static List<String> benchlist=new ArrayList<String>();
    // Declared but not used anywhere in this class.
    static LinkedHashMap<String,String> benchmap=new LinkedHashMap<>();
    // Declared but not used anywhere in this class.
    static List<String> misc1list=new ArrayList<String>();
    // Declared but not read anywhere in this class.
    String csvfile="/home/gauge/Documents/Docs/madras.csv";
    // Writer for the output CSV; opened in the constructor in append mode.
    FileWriter fw;

    /**
     * Opens the output CSV file in append mode.
     *
     * @throws IOException if the file cannot be opened for writing
     */
    public RecursiveFileDisplay()throws IOException {
        fw=new FileWriter("/home/gauge/Documents/Docs/supremecourt.csv",true);
        // TODO Auto-generated constructor stub
    }

    /**
     * Entry point: scans the hard-coded sample directory, prints the sizes of
     * {@code benchlist}, {@code list} and {@code random}, then writes the CSV.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from the constructor or {@link #writeCSV()}
     */
    public static void main(String[] args) throws Exception {
        RecursiveFileDisplay rsd=new RecursiveFileDisplay();
        File currentDir = new File("/home/gauge/Documents/Docs/SampleData/SupremeCourt");
        rsd.readFilesFromDirectory(currentDir);
        System.out.println(benchlist.size());
        System.out.println(list.size());
        System.out.println(random.size());
        rsd.writeCSV();
    }

    /**
     * Appends every entry of {@link #list} to the CSV writer, one per line,
     * flushing after each row. The writer is not closed here.
     *
     * @throws IOException if writing to the CSV file fails
     */
    public void writeCSV()throws IOException {
        for(String str:list) {
            fw.append(str);
            fw.append("\n");
            fw.flush();
        }
        System.out.println("Csv file is done!");
    }

    /**
     * Recursively processes {@code dir}: each regular file is read completely
     * and matched against the patterns below; each sub-directory is visited
     * by a recursive call.
     *
     * <p>Any {@link StackOverflowError} or {@link Exception} raised while
     * processing is printed and then swallowed, which ends the processing of
     * the current directory.
     *
     * @param dir directory whose entries are to be processed
     */
    public void readFilesFromDirectory(File dir) {
        try {
            // Unused local.
            int i=0;
            Pattern p1=Pattern.compile("(Author):(.*)");
            Pattern p=Pattern.compile("(Bench:)(.*)");
            Pattern p2=Pattern.compile("JUDGMENT(.*?)J[.]");
            // NOTE(review): inside the character class [H|h] the '|' is a literal
            // character, so this also accepts "|on" — confirm that is intended.
            Pattern p3=Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");
            //Pattern p4=Pattern.compile(",\\s*([^,]+),[^,]*\\b(J|JJ)\\.");//\s\w*(?=\w*[,]\sJ[.]*\b)
            Pattern p5=Pattern.compile("\\s\\w*(?=\\w*[,]\\sJ[.]*\\b)");
            Pattern p4=Pattern.compile("\\w*(?=\\w*[,]*\\s*((JJ)|(L.J)|(C.J)|(J))[.]\\s\\b)");
            // NOTE(review): this is the pattern reported to raise StackOverflowError
            // on long inputs; the repeated alternation (.|\\n)* is the suspected cause.
            Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");
            // NOTE(review): listFiles() may return null (e.g. dir is not a directory);
            // the resulting exception would be caught by the generic handler below.
            File[] listfiles=dir.listFiles();
            for(File file:listfiles) {
                if(file.isFile()) {
                    // Read the whole file into str, normalising line ends to "\n".
                    String str="";
                    String line="";
                    // The reader is never closed in this method.
                    BufferedReader br=new BufferedReader(new FileReader(file));
                    while((line=br.readLine())!=null) {
                        str+=line+"\n";
                    }
                    Matcher match=p.matcher(str);
                    Matcher match1=p1.matcher(str);
                    Matcher match2=p2.matcher(str);
                    Matcher match3=p3.matcher(str);
                    Matcher match4=p4.matcher(str);
                    Matcher match5=p5.matcher(str);
                    Matcher match6=p6.matcher(str);
                    // The patterns are tried in a fixed order; the first one that
                    // finds a match decides what (if anything) is recorded.
                    if(match.find()) {
                        if(match1.find()) {
                            list.add(file.toString()+"\t"+match.group(2)+"\t"+match1.group(2)); //filename, judgename ,authorname
                            System.out.println(match1.group(2));
                        } else {
                            // "Bench:" found but no "Author:" line: last column is a blank.
                            list.add(file.toString()+"\t"+match.group(2)+"\t"+" ");
                            System.out.println(match.group(2));
                        }
                    } else if(match1.find()) {
                        // Only "Author:" found: middle column is a blank.
                        list.add(file.toString()+"\t"+" "+"\t"+match1.group(2));
                    } else if(match2.find()) {
                        list.add(file.toString()+"\t"+match2.group()+"\t"+" ");
                    } else if(match3.find()) {
                        list.add(file.toString()+"\t"+match3.group()+"\t"+" ");
                    } else if(match4.find()) {
                        //do nothing
                    } else if(match5.find()) {
                        list.add(file.toString()+"\t"+match5.group()+"\t"+" ");
                        System.out.println(file.toString());
                    } else if(match6.find()) {
                        // Only prints a marker; nothing is recorded for this file.
                        System.out.println("lalalalal");
                    } else {
                        misclist.add(file.toString()); //list of documents which have no Judgenames
                        // Copy the unmatched document under a random name; the
                        // writer is flushed but never closed in this method.
                        String name = UUID.randomUUID().toString();
                        PrintWriter pw=new PrintWriter("/home/gauge/Documents/Docs/Misc"+"/"+name);
                        pw.write(str);
                        pw.flush();
                    }
                } else if(file.isDirectory()) {
                    readFilesFromDirectory(file.getAbsoluteFile());
                    System.out.println("recursion");
                }
            }
        } catch(StackOverflowError soe) {
            // Printed and swallowed; remaining entries of this directory are skipped.
            soe.printStackTrace();
            System.err.print(soe);
        } catch (Exception e) {
            // Printed and swallowed; remaining entries of this directory are skipped.
            e.printStackTrace();
            System.err.print(e);
        }
    }
}
当我删除模式p6时,它没有显示任何错误。
stackTrace如下:
at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at 
java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) at 
java.util.regex.Pattern$GroupHead.match(Pattern.java:4658) at java.util.regex.Pattern$Loop.match(Pattern.java:4785) at java.util.regex.Pattern$GroupTail.match(Pattern.java:4717) at java.util.regex.Pattern$BranchConn.match(Pattern.java:4568) at java.util.regex.Pattern$CharProperty.match(Pattern.java:3777) at java.util.regex.Pattern$Branch.match(Pattern.java:4604) java.lang.StackOverflowError
问题出在模式 p6 中的 (.|\\n)* 部分:
(.|\\n)*
p6
Pattern p6=Pattern.compile("(BENCH:)((.|\\n)*)(BENCH)((.|\\n)*)(CITATION)");
在 Oracle / OpenJDK JRE 上,(.|\\n)* 会被编译为以下结构,其实现使用递归(请注意 GroupTail 又指回 Loop)来匹配非确定性模式的重复(在该实现中,交替(alternation)始终被视为非确定性的)。
GroupTail
Loop
Prolog. Loop wrapper Loop [732768bb]. Greedy quantifier {0,2147483647} GroupHead. (DEBUG) local=0 Branch. Alternation (in printed order): Dot. (?:.), equivalent to [^\n\r\u0085\u2028\u2029] --- Single. Match code point: U+000A LINE FEED (LF) --- BranchConn [204d080d]. Connect branches to sequel. GroupTail [214b9e0c]. (DEBUG) local=0, group=2. --[next]--> Loop [732768bb]
在很长的字符串上,调用栈会被耗尽,因此会抛出 StackOverflowError。
StackOverflowError
如果要毫无例外地匹配任意字符,应单独使用 . 并配合 Pattern.DOTALL 标志。
.
Pattern.DOTALL
您可以将标志传递给Pattern.compile(String regex, int flags)方法以打开整个表达式的标志:
Pattern.compile(String regex, int flags)
Pattern p6 = Pattern.compile("(BENCH:)(.*)(BENCH)(.*)(CITATION)", Pattern.DOTALL);
或按照Jonny 5的评论中的建议,也可以使用内联标志(?s):
(?s)
Pattern p6 = Pattern.compile("(?s)(BENCH:)(.*)(BENCH)(.*)(CITATION)");
另外,您也可以打开子模式的标志(?s:.*):
(?s:.*)
Pattern p6 = Pattern.compile("(BENCH:)(?s:(.*))(BENCH)(?s:(.*))(CITATION)");
顺便说一句,您确定要让 p3 匹配像 |onrable 这样的字符串吗?(字符类 [H|h] 中的 | 是字面字符,而不是“或”。)
|onrable
p3
Pattern p3 = Pattern.compile("(([H|h]on)|(HON)).*((ble)|BLE)(.*)");
如果不需要,请将 | 从字符类中删除:
|
Pattern p3 = Pattern.compile("(([Hh]on)|(HON)).*((ble)|BLE)(.*)");
我还注意到捕获组的数量过多。请逐一检查它们是否确实必要。