/**
 * Builds a lookup map from the first term of each phrase to the set of phrases
 * that begin with that term, so phrase matching can be keyed by leading token.
 *
 * @param phraseSet set of phrases, each stored as a {@code char[]}
 * @return map keyed by first term; each value is the set of phrases sharing that first term
 */
private CharArrayMap<CharArraySet> convertPhraseSet( CharArraySet phraseSet ) {
  // was raw CharArrayMap; parameterized to avoid unchecked warnings — behavior unchanged
  CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<CharArraySet>( 100, false );
  Iterator<Object> phraseIt = phraseSet.iterator( );
  while (phraseIt != null && phraseIt.hasNext() ) {
    // CharArraySet iterators yield the raw char[] entries
    char[] phrase = (char[])phraseIt.next();
    Log.debug( "'" + new String( phrase ) + "'" );
    char[] firstTerm = getFirstTerm( phrase );
    Log.debug( "'" + new String( firstTerm ) + "'" );
    CharArraySet itsPhrases = phraseMap.get( firstTerm, 0, firstTerm.length );
    if (itsPhrases == null) {
      itsPhrases = new CharArraySet( 5, false );
      phraseMap.put( new String( firstTerm ), itsPhrases );
    }
    itsPhrases.add( phrase );
  }
  return phraseMap;
}
/**
 * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
 *
 * @param affixStream InputStream to read the content of the affix file from
 * @param decoder CharsetDecoder to decode the content of the file
 * @param strict strictness flag passed through to {@code parseAffix} for PFX/SFX rules
 * @throws IOException Can be thrown while reading from the InputStream
 * @throws ParseException Can be thrown by {@code parseAffix} on malformed affix rules
 */
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
  prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
  suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
  // LineNumberReader is shared with parseAffix, which reads the rule lines
  // that follow each PFX/SFX header
  LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
  String line = null;
  while ((line = reader.readLine()) != null) {
    if (line.startsWith(ALIAS_KEY)) {
      parseAlias(line);
    } else if (line.startsWith(PREFIX_KEY)) {
      parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
    } else if (line.startsWith(SUFFIX_KEY)) {
      parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
    } else if (line.startsWith(FLAG_KEY)) {
      // Assume that the FLAG line comes before any prefix or suffixes
      // Store the strategy so it can be used when parsing the dic file
      flagParsingStrategy = getFlagParsingStrategy(line);
    }
  }
  // NOTE(review): the reader is not closed here — presumably the caller owns
  // affixStream and closes it; confirm before adding a close().
}
/**
 * Builds an analyzer with the given stop words, stem-exclusion table and
 * stemming-override dictionary.
 *
 * @param matchVersion Lucene compatibility version
 * @param stopwords stop words to filter out
 * @param stemExclusionTable terms excluded from stemming
 * @param stemOverrideDict dictionary overriding the stemmer's output per term
 */
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
  this.matchVersion = matchVersion;
  this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
  this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
  // Pre-3.1 (or empty dict): keep the legacy map-based override; otherwise
  // compile the overrides into an FST via StemmerOverrideFilter.Builder.
  // Exactly one of stemdict/origStemdict is non-null.
  if (stemOverrideDict.isEmpty() || !matchVersion.onOrAfter(Version.LUCENE_31)) {
    this.stemdict = null;
    this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
  } else {
    this.origStemdict = null;
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
    CharsRef spare = new CharsRef();
    while (iter.hasNext()) {
      // nextKey() exposes the map's internal char[]; copy it into the reusable CharsRef
      char[] nextKey = iter.nextKey();
      spare.copyChars(nextKey, 0, nextKey.length);
      builder.add(spare, iter.currentValue());
    }
    try {
      this.stemdict = builder.build();
    } catch (IOException ex) {
      throw new RuntimeException("can not build stem dict", ex);
    }
  }
}
/**
 * @deprecated Use {@link #DutchAnalyzer(CharArraySet)}
 */
@Deprecated
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
  // historically, this ctor never populated the stem dict!!!!!
  // so we populate it only for >= 3.6
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
      matchVersion.onOrAfter(Version.LUCENE_3_6) ?
      DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.<String>emptyMap());
}
/**
 * @deprecated Use {@link #DutchAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
  // historically, this ctor never populated the stem dict!!!!!
  // so we populate it only for >= 3.6
  this(matchVersion, stopwords, stemExclusionTable,
      matchVersion.onOrAfter(Version.LUCENE_3_6) ?
      DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.<String>emptyMap());
}
/** * @deprecated Use {@link #DutchAnalyzer(CharArraySet,CharArraySet,CharArrayMap)} */ @Deprecated public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) { setVersion(matchVersion); this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); if (stemOverrideDict.isEmpty() || !matchVersion.onOrAfter(Version.LUCENE_3_1)) { this.stemdict = null; this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict)); } else { this.origStemdict = null; // we don't need to ignore case here since we lowercase in this analyzer anyway StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false); CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator(); CharsRefBuilder spare = new CharsRefBuilder(); while (iter.hasNext()) { char[] nextKey = iter.nextKey(); spare.copyChars(nextKey, 0, nextKey.length); builder.add(spare.get(), iter.currentValue()); } try { this.stemdict = builder.build(); } catch (IOException ex) { throw new RuntimeException("can not build stem dict", ex); } } }
/** * @param singleMatch List<String>, the sequence of strings to match * @param replacement List<Token> the list of tokens to use on a match * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens * @param mergeExisting merge the replacement tokens with any other mappings that exist */ public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) { SlowSynonymMap currMap = this; for (String str : singleMatch) { if (currMap.submap==null) { // for now hardcode at 4.0, as its what the old code did. // would be nice to fix, but shouldn't store a version in each submap!!! currMap.submap = new CharArrayMap<>(Version.LUCENE_CURRENT, 1, ignoreCase()); } SlowSynonymMap map = currMap.submap.get(str); if (map==null) { map = new SlowSynonymMap(); map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } currMap = map; } if (currMap.synonyms != null && !mergeExisting) { throw new IllegalArgumentException("SynonymFilter: there is already a mapping for " + singleMatch); } List<Token> superset = currMap.synonyms==null ? replacement : mergeTokens(Arrays.asList(currMap.synonyms), replacement); currMap.synonyms = superset.toArray(new Token[superset.size()]); if (includeOrig) currMap.flags |= INCLUDE_ORIG; }
/** * Inizializza l'analizzatore sintattico per lingua * * @param language lingua * @param stopwords stop words * @param stemExclusionSet elenco dei termini che non deve essere sottoposto * a stemming * @param stemOverrideDict dizionario dei termini in overriding */ public MyAnalyzer(String language, CharArraySet stopwords, CharArraySet stemExclusionSet, CharArrayMap<String> stemOverrideDict) { super(stopwords); this.language = language; this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemTable = DefaultSetHolder.DEFAULT_TABLE; if (stemOverrideDict.isEmpty()) { this.stemdict = null; } else { Builder builder = new Builder(false); EntryIterator iter = stemOverrideDict.entrySet().iterator(); CharsRefBuilder spare = new CharsRefBuilder(); while (iter.hasNext()) { char[] ex = iter.nextKey(); spare.copyChars(ex, 0, ex.length); builder.add(spare.get(), (CharSequence) iter.currentValue()); } try { this.stemdict = builder.build(); } catch (IOException var8) { throw new RuntimeException("can not build stem dict", var8); } } }
public void doRandom(int iter, boolean ignoreCase) { CharArrayMap<Integer> map = new CharArrayMap<>(1, ignoreCase); HashMap<String,Integer> hmap = new HashMap<>(); char[] key; for (int i=0; i<iter; i++) { int len = random().nextInt(5); key = new char[len]; for (int j=0; j<key.length; j++) { key[j] = (char)random().nextInt(127); } String keyStr = new String(key); String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr; int val = random().nextInt(); Object o1 = map.put(key, val); Object o2 = hmap.put(hmapKey,val); assertEquals(o1,o2); // add it again with the string method assertEquals(val, map.put(keyStr,val).intValue()); assertEquals(val, map.get(key,0,key.length).intValue()); assertEquals(val, map.get(key).intValue()); assertEquals(val, map.get(keyStr).intValue()); assertEquals(hmap.size(), map.size()); } }
/** Verifies the string renderings of the map and each of its collection views. */
public void testToString() {
  CharArrayMap<Integer> map = new CharArrayMap<>(Collections.singletonMap("test", 1), false);
  // single entry: exact renderings are deterministic
  assertEquals("[test]", map.keySet().toString());
  assertEquals("[1]", map.values().toString());
  assertEquals("[test=1]", map.entrySet().toString());
  assertEquals("{test=1}", map.toString());
  map.put("test2", 2);
  // two entries: iteration order is unspecified, so just check the separator
  assertTrue(map.keySet().toString().contains(", "));
  assertTrue(map.values().toString().contains(", "));
  assertTrue(map.entrySet().toString().contains(", "));
  assertTrue(map.toString().contains(", "));
}
@Override public Object create(Random random) { int num = random.nextInt(10); CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean()); for (int i = 0; i < num; i++) { // TODO: make nastier map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random)); } return map; }
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
  // historically, this ctor never populated the stem dict!!!!!
  // so we populate it only for >= 3.6
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
      matchVersion.onOrAfter(Version.LUCENE_36) ?
      DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.<String>emptyMap());
}
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
  // historically, this ctor never populated the stem dict!!!!!
  // so we populate it only for >= 3.6
  this(matchVersion, stopwords, stemExclusionTable,
      matchVersion.onOrAfter(Version.LUCENE_36) ?
      DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.<String>emptyMap());
}
/** * @param singleMatch List<String>, the sequence of strings to match * @param replacement List<Token> the list of tokens to use on a match * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens * @param mergeExisting merge the replacement tokens with any other mappings that exist */ public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) { SlowSynonymMap currMap = this; for (String str : singleMatch) { if (currMap.submap==null) { // for now hardcode at 4.0, as its what the old code did. // would be nice to fix, but shouldn't store a version in each submap!!! currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_40, 1, ignoreCase()); } SlowSynonymMap map = currMap.submap.get(str); if (map==null) { map = new SlowSynonymMap(); map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } currMap = map; } if (currMap.synonyms != null && !mergeExisting) { throw new IllegalArgumentException("SynonymFilter: there is already a mapping for " + singleMatch); } List<Token> superset = currMap.synonyms==null ? replacement : mergeTokens(Arrays.asList(currMap.synonyms), replacement); currMap.synonyms = superset.toArray(new Token[superset.size()]); if (includeOrig) currMap.flags |= INCLUDE_ORIG; }
public void doRandom(int iter, boolean ignoreCase) { CharArrayMap<Integer> map = new CharArrayMap<Integer>(TEST_VERSION_CURRENT, 1, ignoreCase); HashMap<String,Integer> hmap = new HashMap<String,Integer>(); char[] key; for (int i=0; i<iter; i++) { int len = random().nextInt(5); key = new char[len]; for (int j=0; j<key.length; j++) { key[j] = (char)random().nextInt(127); } String keyStr = new String(key); String hmapKey = ignoreCase ? keyStr.toLowerCase(Locale.ROOT) : keyStr; int val = random().nextInt(); Object o1 = map.put(key, val); Object o2 = hmap.put(hmapKey,val); assertEquals(o1,o2); // add it again with the string method assertEquals(val, map.put(keyStr,val).intValue()); assertEquals(val, map.get(key,0,key.length).intValue()); assertEquals(val, map.get(key).intValue()); assertEquals(val, map.get(keyStr).intValue()); assertEquals(hmap.size(), map.size()); } }
/** Verifies the string renderings of the map and each of its collection views. */
public void testToString() {
  CharArrayMap<Integer> map =
      new CharArrayMap<Integer>(TEST_VERSION_CURRENT, Collections.singletonMap("test", 1), false);
  // single entry: exact renderings are deterministic
  assertEquals("[test]", map.keySet().toString());
  assertEquals("[1]", map.values().toString());
  assertEquals("[test=1]", map.entrySet().toString());
  assertEquals("{test=1}", map.toString());
  map.put("test2", 2);
  // two entries: iteration order is unspecified, so just check the separator
  assertTrue(map.keySet().toString().contains(", "));
  assertTrue(map.values().toString().contains(", "));
  assertTrue(map.entrySet().toString().contains(", "));
  assertTrue(map.toString().contains(", "));
}
public void testOverride() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. CharArrayMap<String> dictionary = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false); dictionary.put("booked", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); TokenStream stream = new PorterStemFilter( new StemmerOverrideFilter(tokenizer, dictionary)); assertTokenStreamContents(stream, new String[] { "books" }); }
@Override public Object create(Random random) { int num = random.nextInt(10); CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean()); for (int i = 0; i < num; i++) { // TODO: make nastier map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random)); } return map; }
/** * @param singleMatch List<String>, the sequence of strings to match * @param replacement List<Token> the list of tokens to use on a match * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens * @param mergeExisting merge the replacement tokens with any other mappings that exist */ public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) { SlowSynonymMap currMap = this; for (String str : singleMatch) { if (currMap.submap==null) { // for now hardcode at 4.0, as its what the old code did. // would be nice to fix, but shouldn't store a version in each submap!!! currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_CURRENT, 1, ignoreCase()); } SlowSynonymMap map = currMap.submap.get(str); if (map==null) { map = new SlowSynonymMap(); map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } currMap = map; } if (currMap.synonyms != null && !mergeExisting) { throw new IllegalArgumentException("SynonymFilter: there is already a mapping for " + singleMatch); } List<Token> superset = currMap.synonyms==null ? replacement : mergeTokens(Arrays.asList(currMap.synonyms), replacement); currMap.synonyms = superset.toArray(new Token[superset.size()]); if (includeOrig) currMap.flags |= INCLUDE_ORIG; }