public WikipediaTokenizer testHandwritten() throws Exception { // make sure all tokens are in only one type /*String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " + "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " + " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " + "==heading== ===sub head=== followed by some text [[Category:blah| ]] " + "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>"; */ String wikiTest = "{{other uses}}\n" + "{{Anarchism sidebar |Theory}}\n" + "{{Forms of government}}\n" + "'''Anarchy''' has more than one definition. Some use the term \"Anarchy\" to refer to a society without a publicly enforced government.<ref>{{cite web|url=http://www.amazon.com/dp/1551642484 |title=Decentralism: Where It Came From-Where Is It Going? |publisher=Amazon.com |date= |accessdate=2012-01-30}}</ref><ref>\"Anarchy.\" Oxford English Dictionary. Oxford University Press. 2004. The first quoted usage is 1667</ref> When used in this sense, anarchy may<ref>\"Anarchy.\" Oxford English Dictionary. Oxford University Press. 2004. The first quoted usage is 1552</ref> or may not<ref name=\"Anarchy 2004\">\"Anarchy.\" Oxford English Dictionary. [[Oxford University Press]]. 2004. The first quoted usage is 1850.</ref> be intended to imply political disorder or [[Civil disorder|lawlessness]] within a society. Many anarchists complain with [[Anselme Bellegarrigue]] that \"[v]ulgar error has taken 'anarchy' to be synonymous with 'civil war.{{' \"}}<ref>{{Cite web | title = Lexical Investigations: Anarchy | url = http://hotword.dictionary.com/anarchy/ | accessdate = 11 November 2013 | work = The Hot Word | publisher = [[Reference.com|dictionary.com]] }}</ref>\n" + "\n" + "== Etymology ==\n" + "The word ''anarchy'' comes from the [[ancient Greek]] ἀναρχία, ''anarchia'', from [[:wikt:ἀν-|ἀν]] ''an'', \"not, without\" + [[:wikt:ἀρχός|ἀρχός]] ''arkhos'', \"ruler\", meaning \"absence of a ruler\", \"without rulers\").<ref>\"anarchy, n.\". OED Online. December 2012. Oxford University Press. http://www.oed.com/view/Entry/7118?redirectedFrom=anarchy& (accessed January 17, 2013).</ref>"; return new WikipediaTokenizer(new StringReader(wikiTest)); }
public void testTokenizer() throws Exception { Reader reader = new StringReader("This is a [[Category:foo]]"); Tokenizer tokenizer = tokenizerFactory("Wikipedia").create(newAttributeFactory(), reader); assertTokenStreamContents(tokenizer, new String[] { "This", "is", "a", "foo" }, new int[] { 0, 5, 8, 21 }, new int[] { 4, 7, 9, 24 }, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY }, new int[] { 1, 1, 1, 1, }); }
public void testTokenizer() throws IOException { Reader reader = new StringReader("This is a [[Category:foo]]"); WikipediaTokenizerFactory factory = new WikipediaTokenizerFactory(); Tokenizer tokenizer = factory.create(reader); assertTokenStreamContents(tokenizer, new String[] { "This", "is", "a", "foo" }, new int[] { 0, 5, 8, 21 }, new int[] { 4, 7, 9, 24 }, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY }, new int[] { 1, 1, 1, 1, }); }
public void testTokenizer() throws Exception { Reader reader = new StringReader("This is a [[Category:foo]]"); Tokenizer tokenizer = tokenizerFactory("Wikipedia").create(reader); assertTokenStreamContents(tokenizer, new String[] { "This", "is", "a", "foo" }, new int[] { 0, 5, 8, 21 }, new int[] { 4, 7, 9, 24 }, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY }, new int[] { 1, 1, 1, 1, }); }
@Override public Tokenizer create(Reader input) { return new WikipediaTokenizer(input); }
public WikipediaTokenizer testSimple() throws Exception { String text = "This is a [[Category:foo]]"; return new WikipediaTokenizer(new StringReader(text)); }
public WikipediaTokenizer testLinks() throws Exception { String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]"; return new WikipediaTokenizer(new StringReader(test)); }
public static void main(String[] args){ WikipediaTokenizerTest wtt = new WikipediaTokenizerTest(); try { /* // first test WikipediaTokenizer wikiTokenizer = wtt.testHandwritten(); wikiTokenizer.reset(); int i = 0; while (wikiTokenizer.incrementToken()) { logger.info("seen something"); i++; } wikiTokenizer.end(); wikiTokenizer.close(); // output how many tokens I have seen logger.info("I have seen " + i + " tokens!"); */ WikipediaTokenizer wikiTokenizer = wtt.testHandwritten(); wikiTokenizer.reset(); CharTermAttribute charTermAttrib = wikiTokenizer.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = wikiTokenizer.getAttribute(TypeAttribute.class); OffsetAttribute offset = wikiTokenizer.getAttribute(OffsetAttribute.class); List<String> tokens = new ArrayList<String>(); wikiTokenizer.reset(); while (wikiTokenizer.incrementToken()) { tokens.add(charTermAttrib.toString()); System.out.println(typeAtt.type() + " (" + offset.startOffset() + "," + offset.endOffset() + "): " + charTermAttrib.toString()); } wikiTokenizer.end(); wikiTokenizer.close(); } catch(Exception e){ logger.error("Exception while tokenizing Wiki Text: " + e.getMessage()); e.printStackTrace(); } }