/**
 * Test multiword offsets with the old impl.
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public void testMultiwordOffsetsOld() throws Exception {
  SynonymFilterFactory factory = new SynonymFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("synonyms", "synonyms.txt");
  factory.setLuceneMatchVersion(Version.LUCENE_33);
  factory.init(args);
  factory.inform(new StringMockResourceLoader("national hockey league, nhl"));
  TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
  // The old implementation gives every emitted token the offsets of the whole
  // matched phrase (0..22) rather than per-word offsets.
  assertTokenStreamContents(ts,
      new String[] { "national", "nhl", "hockey", "league" },
      new int[] { 0, 0, 0, 0 },
      new int[] { 22, 22, 22, 22 },
      new int[] { 1, 0, 1, 1 });
}
public void testEncoder() throws Exception {
  Map<String,String> args = new HashMap<String,String>();
  args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, "float");
  DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory();
  factory.init(args);
  ResourceLoader loader = new StringMockResourceLoader("solr/collection1");
  factory.inform(loader);
  TokenStream input = new MockTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"), MockTokenizer.WHITESPACE, false);
  DelimitedPayloadTokenFilter tf = factory.create(input);
  tf.reset();
  while (tf.incrementToken()) {
    PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
    assertTrue("payAttr is null and it shouldn't be", payAttr != null);
    byte[] payData = payAttr.getPayload().bytes;
    assertTrue("payData is null and it shouldn't be", payData != null);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f);
  }
}
public void testDelim() throws Exception {
  Map<String,String> args = new HashMap<String,String>();
  args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, FloatEncoder.class.getName());
  args.put(DelimitedPayloadTokenFilterFactory.DELIMITER_ATTR, "*");
  DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory();
  factory.init(args);
  ResourceLoader loader = new StringMockResourceLoader("solr/collection1");
  factory.inform(loader);
  TokenStream input = new MockTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"), MockTokenizer.WHITESPACE, false);
  DelimitedPayloadTokenFilter tf = factory.create(input);
  tf.reset();
  while (tf.incrementToken()) {
    PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
    assertTrue("payAttr is null and it shouldn't be", payAttr != null);
    byte[] payData = payAttr.getPayload().bytes;
    assertTrue("payData is null and it shouldn't be", payData != null);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f);
  }
}
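/**
 * Not one of the original tests: a minimal sketch of the encoding the two tests
 * above rely on. PayloadHelper.encodeFloat/decodeFloat round-trip a float through
 * the same 4-byte representation the FloatEncoder produces; the test name here is
 * hypothetical.
 */
public void testFloatPayloadRoundTrip() {
  byte[] encoded = PayloadHelper.encodeFloat(0.1f); // 4 bytes, Float.floatToIntBits layout
  assertEquals(0.1f, PayloadHelper.decodeFloat(encoded), 0.0f);
}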
public void testNormalization() throws IOException {
  // "W\u0049\u0307" is 'W' followed by 'I' + COMBINING DOT ABOVE, the canonically
  // decomposed form of the dotted capital 'İ' used later in the same string.
  String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
  String turkishLowerCase = "ı will use turkish casıng";
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("language", "tr");
  args.put("strength", "primary");
  args.put("decomposition", "canonical");
  factory.init(args);
  factory.inform(new StringMockResourceLoader(""));
  TokenStream tsUpper = factory.create(
      new MockTokenizer(new StringReader(turkishUpperCase), MockTokenizer.KEYWORD, false));
  TokenStream tsLower = factory.create(
      new MockTokenizer(new StringReader(turkishLowerCase), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUpper, tsLower);
}
public void testFullDecomposition() throws IOException {
  // fullwidth Latin "Ｔｅｓｔｉｎｇ", written with escapes so the distinction from
  // the halfwidth string below survives re-encoding (the two strings must differ
  // for this test to exercise full decomposition at all)
  String fullWidth = "\uFF34\uFF45\uFF53\uFF54\uFF49\uFF4E\uFF47";
  String halfWidth = "Testing";
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("language", "zh");
  args.put("strength", "identical");
  args.put("decomposition", "full");
  factory.init(args);
  factory.inform(new StringMockResourceLoader(""));
  TokenStream tsFull = factory.create(
      new MockTokenizer(new StringReader(fullWidth), MockTokenizer.KEYWORD, false));
  TokenStream tsHalf = factory.create(
      new MockTokenizer(new StringReader(halfWidth), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsFull, tsHalf);
}
public void testSecondaryStrength() throws IOException {
  String upperCase = "TESTING";
  String lowerCase = "testing";
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("language", "en");
  args.put("strength", "secondary");
  args.put("decomposition", "no");
  factory.init(args);
  factory.inform(new StringMockResourceLoader(""));
  TokenStream tsUpper = factory.create(
      new MockTokenizer(new StringReader(upperCase), MockTokenizer.KEYWORD, false));
  TokenStream tsLower = factory.create(
      new MockTokenizer(new StringReader(lowerCase), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUpper, tsLower);
}
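/**
 * The collation tests above call an assertCollatesToSame helper that is not shown
 * in this excerpt. This is a minimal sketch consistent with how it is used, not
 * necessarily the original helper: each KEYWORD-tokenized stream should yield
 * exactly one collation-key term, and the two keys must be equal.
 * (CharTermAttribute import assumed.)
 */
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
  stream1.reset();
  stream2.reset();
  CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
  CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
  assertTrue(stream1.incrementToken());
  assertTrue(stream2.incrementToken());
  assertEquals(term1.toString(), term2.toString());
  assertFalse(stream1.incrementToken());
  assertFalse(stream2.incrementToken());
  stream1.end();
  stream2.end();
  stream1.close();
  stream2.close();
}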
public void testMultiWordSynonyms() throws Exception {
  Reader reader = new StringReader("a e");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Synonym", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("a b c,d"),
      "synonyms", "synonyms.txt").create(stream);
  // This fails because ["e","e"] is the value of the token stream
  assertTokenStreamContents(stream, new String[] { "a", "e" });
}
/**
 * Test multiword offsets with the old impl.
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public void testMultiwordOffsetsOld() throws Exception {
  Reader reader = new StringReader("national hockey league");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Synonym", Version.LUCENE_3_3,
      new StringMockResourceLoader("national hockey league, nhl"),
      "synonyms", "synonyms.txt").create(stream);
  // The old implementation gives every emitted token the offsets of the whole
  // matched phrase (0..22) rather than per-word offsets.
  assertTokenStreamContents(stream,
      new String[] { "national", "nhl", "hockey", "league" },
      new int[] { 0, 0, 0, 0 },
      new int[] { 22, 22, 22, 22 },
      new int[] { 1, 0, 1, 1 });
}
/** If the synonyms are completely empty, test that we still analyze correctly. */
public void testEmptySynonyms() throws Exception {
  Reader reader = new StringReader("GB");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Synonym", TEST_VERSION_CURRENT,
      new StringMockResourceLoader(""), // empty file!
      "synonyms", "synonyms.txt").create(stream);
  assertTokenStreamContents(stream, new String[] { "GB" });
}
public void testKeywords() throws Exception {
  // our stemdict stems "dogs" to "cat"
  Reader reader = new StringReader("testing dogs");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("StemmerOverride", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("dogs\tcat"),
      "dictionary", "stemdict.txt").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "test", "cat" });
}
public void testKeywordsCaseInsensitive() throws Exception {
  Reader reader = new StringReader("testing DoGs");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("StemmerOverride", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("dogs\tcat"),
      "dictionary", "stemdict.txt",
      "ignoreCase", "true").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "test", "cat" });
}
public void testKeywords() throws Exception {
  Reader reader = new StringReader("dogs cats");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("KeywordMarker", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("cats"),
      "protected", "protwords.txt").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "dog", "cats" });
}
public void testKeywordsMixed() throws Exception {
  Reader reader = new StringReader("dogs cats birds");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("KeywordMarker", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("cats"),
      "protected", "protwords.txt",
      "pattern", "birds|Dogs").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "dog", "cats", "birds" });
}
public void testKeywordsCaseInsensitive() throws Exception {
  Reader reader = new StringReader("dogs cats Cats");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("KeywordMarker", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("cats"),
      "protected", "protwords.txt",
      "ignoreCase", "true").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "dog", "cats", "Cats" });
}
public void testKeywordsCaseInsensitiveMixed() throws Exception {
  Reader reader = new StringReader("dogs cats Cats Birds birds");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("KeywordMarker", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("cats"),
      "protected", "protwords.txt",
      "pattern", "birds",
      "ignoreCase", "true").create(stream);
  stream = tokenFilterFactory("PorterStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "dog", "cats", "Cats", "Birds", "birds" });
}
/**
 * Test the protected words mechanism of SnowballPorterFilterFactory.
 */
public void testProtected() throws Exception {
  Reader reader = new StringReader("ridding of some stemming");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("SnowballPorter", TEST_VERSION_CURRENT,
      new StringMockResourceLoader("ridding"),
      "protected", "protwords.txt",
      "language", "English").create(stream);
  assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
}
public void testCustomRules() throws Exception {
  RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));

  String DIN5007_2_tailorings =
      "& ae , a\u0308 & AE , A\u0308" +
      "& oe , o\u0308 & OE , O\u0308" +
      "& ue , u\u0308 & UE , u\u0308";

  RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
  String tailoredRules = tailoredCollator.getRules();
  //
  // at this point, you would save these tailoredRules to a file,
  // and use the custom parameter.
  //
  String germanUmlaut = "Töne";
  String germanOE = "Toene";
  Map<String,String> args = new HashMap<>();
  args.put("custom", "rules.txt");
  args.put("strength", "primary");
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory(args);
  factory.inform(new StringMockResourceLoader(tailoredRules));
  TokenStream tsUmlaut = factory.create(
      new MockTokenizer(new StringReader(germanUmlaut), MockTokenizer.KEYWORD, false));
  TokenStream tsOE = factory.create(
      new MockTokenizer(new StringReader(germanOE), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUmlaut, tsOE);
}
public void testMultiWordSynonyms() throws IOException {
  SynonymFilterFactory factory = new SynonymFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("synonyms", "synonyms.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(new StringMockResourceLoader("a b c,d"));
  TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false));
  // This fails because ["e","e"] is the value of the token stream
  assertTokenStreamContents(ts, new String[] { "a", "e" });
}
/** If the synonyms are completely empty, test that we still analyze correctly. */
public void testEmptySynonyms() throws Exception {
  SynonymFilterFactory factory = new SynonymFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("synonyms", "synonyms.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(new StringMockResourceLoader("")); // empty file!
  TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
  assertTokenStreamContents(ts, new String[] { "GB" });
}
public void testKeywords() throws IOException {
  // our stemdict stems "dogs" to "cat"
  Reader reader = new StringReader("testing dogs");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  ResourceLoader loader = new StringMockResourceLoader("dogs\tcat");
  args.put("dictionary", "stemdict.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
  assertTokenStreamContents(ts, new String[] { "test", "cat" });
}
public void testKeywordsCaseInsensitive() throws IOException {
  Reader reader = new StringReader("testing DoGs");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  ResourceLoader loader = new StringMockResourceLoader("dogs\tcat");
  args.put("dictionary", "stemdict.txt");
  args.put("ignoreCase", "true");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
  assertTokenStreamContents(ts, new String[] { "test", "cat" });
}
public void testKeywords() throws IOException {
  Reader reader = new StringReader("dogs cats");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  ResourceLoader loader = new StringMockResourceLoader("cats");
  args.put("protected", "protwords.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
  assertTokenStreamContents(ts, new String[] { "dog", "cats" });
}
public void testKeywordsCaseInsensitive() throws IOException {
  Reader reader = new StringReader("dogs cats Cats");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  ResourceLoader loader = new StringMockResourceLoader("cats");
  args.put("protected", "protwords.txt");
  args.put("ignoreCase", "true");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
  assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
}
/**
 * Test the protected words mechanism of SnowballPorterFilterFactory.
 */
public void testProtected() throws Exception {
  SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
  ResourceLoader loader = new StringMockResourceLoader("ridding");
  Map<String,String> args = new HashMap<String,String>();
  args.put("protected", "protwords.txt");
  args.put("language", "English");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);
  Reader reader = new StringReader("ridding of some stemming");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
}
public void testBasicUsage() throws IOException {
  String turkishUpperCase = "I WİLL USE TURKİSH CASING";
  String turkishLowerCase = "ı will use turkish casıng";
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("language", "tr");
  args.put("strength", "primary");
  factory.init(args);
  factory.inform(new StringMockResourceLoader(""));
  TokenStream tsUpper = factory.create(
      new MockTokenizer(new StringReader(turkishUpperCase), MockTokenizer.KEYWORD, false));
  TokenStream tsLower = factory.create(
      new MockTokenizer(new StringReader(turkishLowerCase), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUpper, tsLower);
}
public void testCustomRules() throws Exception {
  RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));

  String DIN5007_2_tailorings =
      "& ae , a\u0308 & AE , A\u0308" +
      "& oe , o\u0308 & OE , O\u0308" +
      "& ue , u\u0308 & UE , u\u0308";

  RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
  String tailoredRules = tailoredCollator.getRules();
  //
  // at this point, you would save these tailoredRules to a file,
  // and use the custom parameter.
  //
  String germanUmlaut = "Töne";
  String germanOE = "Toene";
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("custom", "rules.txt");
  args.put("strength", "primary");
  factory.init(args);
  factory.inform(new StringMockResourceLoader(tailoredRules));
  TokenStream tsUmlaut = factory.create(
      new MockTokenizer(new StringReader(germanUmlaut), MockTokenizer.KEYWORD, false));
  TokenStream tsOE = factory.create(
      new MockTokenizer(new StringReader(germanOE), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUmlaut, tsOE);
}
/**
 * Test multiword offsets with the old impl.
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public void testMultiwordOffsetsOld() throws Exception {
  Reader reader = new StringReader("national hockey league");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Synonym", Version.LUCENE_33,
      new StringMockResourceLoader("national hockey league, nhl"),
      "synonyms", "synonyms.txt").create(stream);
  // The old implementation gives every emitted token the offsets of the whole
  // matched phrase (0..22) rather than per-word offsets.
  assertTokenStreamContents(stream,
      new String[] { "national", "nhl", "hockey", "league" },
      new int[] { 0, 0, 0, 0 },
      new int[] { 22, 22, 22, 22 },
      new int[] { 1, 0, 1, 1 });
}
public void testCustomRules() throws Exception {
  RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new Locale("de", "DE"));

  String DIN5007_2_tailorings =
      "& ae , a\u0308 & AE , A\u0308" +
      "& oe , o\u0308 & OE , O\u0308" +
      "& ue , u\u0308 & UE , u\u0308";

  RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
  String tailoredRules = tailoredCollator.getRules();
  //
  // at this point, you would save these tailoredRules to a file,
  // and use the custom parameter.
  //
  String germanUmlaut = "Töne";
  String germanOE = "Toene";
  Map<String,String> args = new HashMap<String,String>();
  args.put("custom", "rules.txt");
  args.put("strength", "primary");
  CollationKeyFilterFactory factory = new CollationKeyFilterFactory(args);
  factory.inform(new StringMockResourceLoader(tailoredRules));
  TokenStream tsUmlaut = factory.create(
      new MockTokenizer(new StringReader(germanUmlaut), MockTokenizer.KEYWORD, false));
  TokenStream tsOE = factory.create(
      new MockTokenizer(new StringReader(germanOE), MockTokenizer.KEYWORD, false));
  assertCollatesToSame(tsUmlaut, tsOE);
}
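/**
 * Not part of the original suite: a minimal sketch of the persistence step the
 * comment inside testCustomRules alludes to, i.e. writing the tailored rules to
 * the "rules.txt" file named by the "custom" parameter. The helper name and the
 * plain java.io mechanics are assumptions, not the project's actual tooling
 * (imports for File, FileOutputStream, OutputStreamWriter, Writer assumed).
 */
private static void writeTailoredRules(String tailoredRules, File confDir) throws IOException {
  // RuleBasedCollator rules are plain text; UTF-8 keeps the \u0308 tailorings intact.
  Writer writer = new OutputStreamWriter(new FileOutputStream(new File(confDir, "rules.txt")), "UTF-8");
  try {
    writer.write(tailoredRules);
  } finally {
    writer.close();
  }
}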