private void testThreadSafety(TokenFilterFactory factory) throws IOException {
  final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
    }
  };
  BaseTokenStreamTestCase.checkRandomData(random(), analyzer, 100);
}
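// A minimal usage sketch, not part of the original tests: it shows how the helper
// above might be invoked with a concrete TokenFilterFactory. LowerCaseFilterFactory
// and the empty args map are illustrative assumptions; any factory under test works.
public void testLowerCaseFilterFactoryThreadSafety() throws IOException {
  testThreadSafety(new LowerCaseFilterFactory(new HashMap<String, String>()));
}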
public void testStandardAnalyzer() throws IOException {
  Analyzer analyzer = new JiebaAnalyzer();
  checkRandomData(new Random(0), analyzer, 1);
  System.out.println(BaseTokenStreamTestCase.toString(analyzer,
      "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"));
  System.out.println("==============");
  System.out.println(BaseTokenStreamTestCase.toString(analyzer,
      "hello world,this is my first program"));
  System.out.println("==============");
  System.out.println(BaseTokenStreamTestCase.toString(analyzer,
      "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"));
}
public void testAnalyzerFactory() throws Exception {
  String text = "Fortieth, Quarantième, Cuadragésimo";
  Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig(
      "ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
      "positionIncrementGap:100,offsetGap:1111,"
      + "MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
      + "PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
      + "StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
  BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
      new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
                     "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
                     "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io" });
}
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  UAX29URLEmailTokenizer tokenizer =
      new UAX29URLEmailTokenizer(newAttributeFactory(), new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
public void testLUCENE1545() throws Exception {
  /*
   * Standard analyzer does not correctly tokenize the combining character U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
   * The expected result is only one token, "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
public void testApostrophesSA() throws Exception {
  // internal apostrophes: O'Reilly, you're, O'Reilly's
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
}
public void testVariousTextSA() throws Exception {
  // various
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted",
      new String[]{"C", "embedded", "developers", "wanted"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[]{"foo", "bar", "FOO", "BAR"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR",
      new String[]{"foo", "bar", "FOO", "BAR"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word",
      new String[]{"QUOTED", "word"});
}
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  BaseTokenStreamTestCase.assertAnalyzesTo(a, input, new String[]{"testing", "1234"});
}
public void testLUCENE1545() throws Exception {
  /*
   * Standard analyzer does not correctly tokenize the combining character U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
   * The expected result is only one token, "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
public void testApostrophesSA() throws Exception {
  // internal apostrophes: O'Reilly, you're, O'Reilly's
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
}
public void testNumericSA() throws Exception {
  // floating point, serial, model numbers, ip addresses, etc.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
public void testVariousTextSA() throws Exception {
  // various
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted",
      new String[]{"c", "embedded", "developers", "wanted"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[]{"foo", "bar", "foo", "bar"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR",
      new String[]{"foo", "bar", "foo", "bar"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word",
      new String[]{"quoted", "word"});
}
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
public void testNumericSA() throws Exception {
  // floating point, serial, model numbers, ip addresses, etc.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
/** Run a vocabulary test against two data files. */
public static void assertVocabulary(Analyzer a, InputStream voc, InputStream out) throws IOException {
  BufferedReader vocReader = new BufferedReader(
      new InputStreamReader(voc, StandardCharsets.UTF_8));
  BufferedReader outputReader = new BufferedReader(
      new InputStreamReader(out, StandardCharsets.UTF_8));
  String inputWord = null;
  while ((inputWord = vocReader.readLine()) != null) {
    String expectedWord = outputReader.readLine();
    Assert.assertNotNull(expectedWord);
    BaseTokenStreamTestCase.checkOneTerm(a, inputWord, expectedWord);
  }
}
/** Run a vocabulary test against one file: tab separated. */
public static void assertVocabulary(Analyzer a, InputStream vocOut) throws IOException {
  BufferedReader vocReader = new BufferedReader(
      new InputStreamReader(vocOut, StandardCharsets.UTF_8));
  String inputLine = null;
  while ((inputLine = vocReader.readLine()) != null) {
    // skip comment lines and blank lines
    if (inputLine.startsWith("#") || inputLine.trim().length() == 0)
      continue;
    String[] words = inputLine.split("\t");
    BaseTokenStreamTestCase.checkOneTerm(a, words[0], words[1]);
  }
}
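// A hypothetical usage sketch, not from the original suite: the resource names
// ("stemming-voc.txt", "stemming-output.txt") and the StandardAnalyzer stand-in are
// illustrative assumptions. It shows how the two-stream helper above is meant to be
// called with parallel input/expected-output files.
public void testVocabularyFiles() throws IOException {
  Analyzer a = new StandardAnalyzer(); // stand-in for the analyzer under test
  try (InputStream voc = getClass().getResourceAsStream("stemming-voc.txt");
       InputStream out = getClass().getResourceAsStream("stemming-output.txt")) {
    assertVocabulary(a, voc, out);
  }
}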
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  UAX29URLEmailTokenizer tokenizer =
      new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}