/**
 * Runs a Danish sentence through the hyphenation filter, using the
 * {@code da_UTF8.xml} grammar and a dictionary holding "læse" and "hest",
 * and checks that the trailing compound "læsehest" is followed by both
 * dictionary words.
 */
public void testHyphenationCompoundWordsDA() throws Exception {
  final CharArraySet subwords = makeDictionary("læse", "hest");

  final String grammarUrl = getClass().getResource("da_UTF8.xml").toExternalForm();
  final HyphenationTree patterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(grammarUrl));

  final MockTokenizer words = new MockTokenizer(
      new StringReader("min veninde som er lidt af en læsehest"),
      MockTokenizer.WHITESPACE,
      false);
  final HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
      words,
      patterns,
      subwords,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
      false);

  final String[] expectedTerms = {
      "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"
  };
  // One entry per expected term; the two decompounded parts carry 0.
  final int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

  assertTokenStreamContents(filter, expectedTerms, expectedIncrements);
}
public void testHyphenationCompoundWordsDELongestMatch() throws Exception { CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv"); InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter .getHyphenationTree(is); // the word basket will not be added due to the longest match option HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); assertTokenStreamContents(tf, new String[] { "basketballkurv", "basketball", "ball", "kurv" }, new int[] { 1, 0, 0, 0 } ); }
/**
 * Runs a Danish sentence through the hyphenation filter (constructed with
 * {@code TEST_VERSION_CURRENT}), using the {@code da_UTF8.xml} grammar and a
 * dictionary holding "læse" and "hest", and checks that the trailing
 * compound "læsehest" is followed by both dictionary words.
 */
public void testHyphenationCompoundWordsDA() throws Exception {
  final CharArraySet subwords = makeDictionary("læse", "hest");

  final String grammarUrl = getClass().getResource("da_UTF8.xml").toExternalForm();
  final HyphenationTree patterns =
      HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(grammarUrl));

  final MockTokenizer words = new MockTokenizer(
      new StringReader("min veninde som er lidt af en læsehest"),
      MockTokenizer.WHITESPACE,
      false);
  final HyphenationCompoundWordTokenFilter filter = new HyphenationCompoundWordTokenFilter(
      TEST_VERSION_CURRENT,
      words,
      patterns,
      subwords,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
      false);

  final String[] expectedTerms = {
      "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"
  };
  // One entry per expected term; the two decompounded parts carry 0.
  final int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };

  assertTokenStreamContents(filter, expectedTerms, expectedIncrements);
}
public void testHyphenationCompoundWordsDELongestMatch() throws Exception { CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv"); InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter .getHyphenationTree(is); // the word basket will not be added due to the longest match option HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); assertTokenStreamContents(tf, new String[] { "basketballkurv", "basketball", "ball", "kurv" }, new int[] { 1, 0, 0, 0 } ); }
/**
 * Version-taking convenience constructor: forwards to the eight-argument
 * constructor with {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE}, {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * {@code onlyLongestMatch} set to {@code false}.
 *
 * @param matchVersion passed through unchanged
 * @param input        the {@link TokenStream} to process
 * @param hyphenator   the hyphenation pattern tree to use for hyphenation
 * @param dictionary   the word dictionary to match against
 * @deprecated Use {@link #HyphenationCompoundWordTokenFilter(TokenStream,HyphenationTree,CharArraySet)}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary) {
  this(
      matchVersion,
      input,
      hyphenator,
      dictionary,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE,
      false);
}
/**
 * Version-taking full constructor: hands every sizing argument, the
 * dictionary and {@code matchVersion} to the superclass constructor and
 * stores the hyphenation tree on this instance.
 *
 * @param matchVersion     passed through to the superclass unchanged
 * @param input            the {@link TokenStream} to process
 * @param hyphenator       the hyphenation pattern tree to use for hyphenation
 * @param dictionary       the word dictionary to match against
 * @param minWordSize      passed through to the superclass unchanged
 * @param minSubwordSize   passed through to the superclass unchanged
 * @param maxSubwordSize   passed through to the superclass unchanged
 * @param onlyLongestMatch passed through to the superclass unchanged
 * @deprecated Use {@link #HyphenationCompoundWordTokenFilter(TokenStream,HyphenationTree,CharArraySet,int,int,int,boolean)}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(
      matchVersion,
      input,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);

  this.hyphenator = hyphenator;
}
/**
 * Version-taking constructor for use without a dictionary: forwards to the
 * eight-argument constructor with a {@code null} dictionary and
 * {@code onlyLongestMatch} set to {@code false}.
 *
 * @param matchVersion   passed through unchanged
 * @param input          the {@link TokenStream} to process
 * @param hyphenator     the hyphenation pattern tree to use for hyphenation
 * @param minWordSize    passed through unchanged
 * @param minSubwordSize passed through unchanged
 * @param maxSubwordSize passed through unchanged
 * @deprecated Use {@link #HyphenationCompoundWordTokenFilter(TokenStream,HyphenationTree,int,int,int)}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  this(
      matchVersion,
      input,
      hyphenator,
      null, // no dictionary
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      false);
}
/**
 * Version-taking minimal constructor: forwards to the six-argument
 * constructor with {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE}.
 *
 * @param matchVersion passed through unchanged
 * @param input        the {@link TokenStream} to process
 * @param hyphenator   the hyphenation pattern tree to use for hyphenation
 * @deprecated Use {@link #HyphenationCompoundWordTokenFilter(TokenStream,HyphenationTree)}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator) {
  this(
      matchVersion,
      input,
      hyphenator,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE);
}
@Override public Object create(Random random) { // TODO: make nastier try { InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = Lucene43HyphenationCompoundWordTokenFilter.getHyphenationTree(is); return hyphenator; } catch (Exception ex) { Rethrow.rethrow(ex); return null; // unreachable code } }
@Override public Object create(Random random) { // TODO: make nastier try { InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); return hyphenator; } catch (Exception ex) { Rethrow.rethrow(ex); return null; // unreachable code } }
/** * With hyphenation-only, you can get a lot of nonsense tokens. * This can be controlled with the min/max subword size. */ public void testHyphenationOnly() throws Exception { InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4); // min=2, max=4 assertTokenStreamContents(tf, new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } ); tf = new HyphenationCompoundWordTokenFilter( new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6); // min=4, max=6 assertTokenStreamContents(tf, new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" } ); tf = new HyphenationCompoundWordTokenFilter( new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10); // min=4, max=10 assertTokenStreamContents(tf, new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" } ); }
/** * With hyphenation-only, you can get a lot of nonsense tokens. * This can be controlled with the min/max subword size. */ public void testHyphenationOnly() throws Exception { InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4); // min=2, max=4 assertTokenStreamContents(tf, new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } ); tf = new HyphenationCompoundWordTokenFilter( TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6); // min=4, max=6 assertTokenStreamContents(tf, new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" } ); tf = new HyphenationCompoundWordTokenFilter( TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false), hyphenator, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10); // min=4, max=10 assertTokenStreamContents(tf, new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" } ); }
/**
 * Builds a {@link Lucene43HyphenationCompoundWordTokenFilter}. The
 * dictionary and all sizing options are handed to the superclass
 * constructor; the hyphenation tree is stored on this instance.
 *
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public Lucene43HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(
      input,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);

  this.hyphenator = hyphenator;
}
/**
 * Create a Lucene43HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Calls {@link #Lucene43HyphenationCompoundWordTokenFilter(TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
 * Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator,
 * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
 *
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 */
public Lucene43HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // A null dictionary selects hyphenation-only operation; onlyLongestMatch is off.
  this(input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false);
}
/**
 * Builds a {@link HyphenationTree} and loads its patterns from the given
 * XML grammar.
 *
 * @param hyphenationSource the InputSource pointing to the XML grammar
 * @return the freshly created tree, after its patterns have been loaded
 * @throws IOException If there is a low-level I/O error.
 */
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
    throws IOException {
  final HyphenationTree patterns = new HyphenationTree();
  patterns.loadPatterns(hyphenationSource);
  return patterns;
}
/**
 * Builds a {@link HyphenationCompoundWordTokenFilter}. The dictionary and
 * all sizing options are handed to the superclass constructor; the
 * hyphenation tree is stored on this instance.
 *
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(
      input,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);

  this.hyphenator = hyphenator;
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.util.CharArraySet, int, int, int, boolean)
 * HyphenationCompoundWordTokenFilter(input, hyphenator,
 * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
 *
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 */
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // A null dictionary selects hyphenation-only operation; onlyLongestMatch is off.
  this(input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false);
}
/**
 * Builds a {@link HyphenationCompoundWordTokenFilter}. The version, the
 * dictionary and all sizing options are handed to the superclass
 * constructor; the hyphenation tree is stored on this instance.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See <a
 *          href="CompoundWordTokenFilterBase.html#version"
 *          >CompoundWordTokenFilterBase</a> for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    CharArraySet dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(
      matchVersion,
      input,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);

  this.hyphenator = hyphenator;
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
 * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
 *
 * @param matchVersion
 *          passed through unchanged to the eight-argument constructor
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // A null dictionary selects hyphenation-only operation; onlyLongestMatch is off.
  this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false);
}