@Override
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    final char[] buffer = termAtt.buffer();
    final int bufferLength = termAtt.length();
    if (bufferLength >= 2 &&
        (buffer[bufferLength - 2] == '\'' ||
         (matchVersion.onOrAfter(Version.LUCENE_3_6) &&
          (buffer[bufferLength - 2] == '\u2019' || buffer[bufferLength - 2] == '\uFF07'))) &&
        (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
        termAtt.setLength(bufferLength - 2); // Strip last 2 characters off
    }
    return true;
}
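// --- Hedged usage sketch, not from the original source ---
// The method above matches Lucene's EnglishPossessiveFilter, which strips a trailing
// "'s" (and, from 3.6 on, the Unicode apostrophes \u2019 and \uFF07). Assuming the
// Lucene 4.x analysis API (org.apache.lucene.analysis.standard.StandardTokenizer,
// org.apache.lucene.analysis.en.EnglishPossessiveFilter), such a filter typically
// sits right after the tokenizer in an analysis chain; the analyzer instance and
// match version here are illustrative.
Analyzer possessiveAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new StandardTokenizer(Version.LUCENE_44, reader);
        TokenStream result = new EnglishPossessiveFilter(Version.LUCENE_44, source); // "John's" -> "John"
        return new TokenStreamComponents(source, result);
    }
};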
public FbEntitySearcher(String indexDir, int numOfDocs, String searchingStrategy) throws IOException {
    LogInfo.begin_track("Constructing Searcher");
    if (!searchingStrategy.equals("exact") && !searchingStrategy.equals("inexact"))
        throw new RuntimeException("Bad searching strategy: " + searchingStrategy);
    this.searchStrategy = searchingStrategy;

    queryParser = new QueryParser(
        Version.LUCENE_44,
        FbIndexField.TEXT.fieldName(),
        searchingStrategy.equals("exact") ? new KeywordAnalyzer() : new StandardAnalyzer(Version.LUCENE_44));
    LogInfo.log("Opening index dir: " + indexDir);
    IndexReader indexReader = DirectoryReader.open(SimpleFSDirectory.open(new File(indexDir)));
    indexSearcher = new IndexSearcher(indexReader);
    LogInfo.log("Opened index with " + indexReader.numDocs() + " documents.");

    this.numOfDocs = numOfDocs;
    LogInfo.end_track();
}
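// --- Hedged sketch, not in the original class ---
// A search method such a searcher might expose, using the queryParser and
// indexSearcher fields initialized above. The method name, return type, and
// query escaping are assumptions; QueryParser.parse, QueryParser.escape,
// IndexSearcher.search, and IndexSearcher.doc are standard Lucene 4.4 API.
public List<Document> search(String entityName) throws ParseException, IOException {
    Query query = queryParser.parse(QueryParser.escape(entityName));
    TopDocs topDocs = indexSearcher.search(query, numOfDocs);
    List<Document> docs = new ArrayList<>();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        docs.add(indexSearcher.doc(scoreDoc.doc));
    }
    return docs;
}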
/**
 * @deprecated Use {@link #parse(String, String[], BooleanClause.Occur[], Analyzer)}
 */
@Deprecated
public static Query parse(Version matchVersion, String query, String[] fields,
        BooleanClause.Occur[] flags, Analyzer analyzer) throws ParseException {
    if (fields.length != flags.length)
        throw new IllegalArgumentException("fields.length != flags.length");
    BooleanQuery bQuery = new BooleanQuery();
    for (int i = 0; i < fields.length; i++) {
        QueryParser qp = new QueryParser(matchVersion, fields[i], analyzer);
        Query q = qp.parse(query);
        if (q != null && // q never null, just being defensive
            (!(q instanceof BooleanQuery) || ((BooleanQuery) q).getClauses().length > 0)) {
            bQuery.add(q, flags[i]);
        }
    }
    return bQuery;
}
private static void checksumFromLuceneFile(Directory directory, String file,
        ImmutableMap.Builder<String, StoreFileMetaData> builder, ESLogger logger,
        Version version, boolean readFileAsHash) throws IOException {
    final String checksum;
    final BytesRefBuilder fileHash = new BytesRefBuilder();
    try (final IndexInput in = directory.openInput(file, IOContext.READONCE)) {
        final long length;
        try {
            length = in.length();
            if (length < CodecUtil.footerLength()) {
                // truncated files trigger IAE if we seek negative... these files are really corrupted though
                throw new CorruptIndexException("Can't retrieve checksum from file: " + file
                    + " file length must be >= " + CodecUtil.footerLength() + " but was: " + in.length(), in);
            }
            if (readFileAsHash) {
                // as additional safety, checksum the entire file we read the hash for...
                final VerifyingIndexInput verifyingIndexInput = new VerifyingIndexInput(in);
                hashFile(fileHash, new InputStreamIndexInput(verifyingIndexInput, length), length);
                checksum = digestToString(verifyingIndexInput.verify());
            } else {
                checksum = digestToString(CodecUtil.retrieveChecksum(in));
            }
        } catch (Throwable ex) {
            logger.debug("Cannot retrieve checksum from file [{}]", ex, file);
            throw ex;
        }
        builder.put(file, new StoreFileMetaData(file, length, checksum, version, fileHash.get()));
    }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
    InputStream stream = null;
    try {
        if (dictFile != null) // the dictionary can be empty.
            dictionary = getWordSet(loader, dictFile, false);
        // TODO: Broken, because we cannot resolve the real system id;
        // ResourceLoader should also supply a method, like ClassLoader does, to get a resource URL
        stream = loader.openResource(hypFile);
        final InputSource is = new InputSource(stream);
        is.setEncoding(encoding); // if it's null, let the XML parser decide
        is.setSystemId(hypFile);
        if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
            hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
        } else {
            hyphenator = Lucene43HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
        }
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
    if (!version.onOrAfter(Version.LUCENE_4_4_0)) {
        throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }
    charUtils = version.onOrAfter(Version.LUCENE_4_4_0)
        ? CharacterUtils.getInstance(version)
        : CharacterUtils.getJava4Instance();
    if (minGram < 1) {
        throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
        throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars, and + 1024 of buffering to not keep polling the Reader
    charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.getBuffer().length];
    // Make the term att large enough
    termAtt.resizeBuffer(2 * maxGram);
}
@Inject
public KeepWordFilterFactory(Index index, IndexSettingsService indexSettingsService,
        Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY, null);
    final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
    if ((arrayKeepWords == null && keepWordsPath == null)
            || (arrayKeepWords != null && keepWordsPath != null)) {
        // we don't allow both or none
        throw new IllegalArgumentException("keep requires either `" + KEEP_WORDS_KEY + "` or `"
            + KEEP_WORDS_PATH_KEY + "` to be configured");
    }
    if (version.onOrAfter(Version.LUCENE_4_4) && settings.get(ENABLE_POS_INC_KEY) != null) {
        throw new IllegalArgumentException(ENABLE_POS_INC_KEY + " is not supported anymore. Please fix your analysis chain or use"
            + " an older compatibility version (<=4.3) but beware that it might cause highlighting bugs.");
    }
    enablePositionIncrements = version.onOrAfter(Version.LUCENE_4_4)
        ? true
        : settings.getAsBoolean(ENABLE_POS_INC_KEY, true);

    this.keepWords = Analysis.getWordSet(env, settings, KEEP_WORDS_KEY);
}
/**
 * @deprecated Use {@link #PortugueseAnalyzer(CharArraySet, CharArraySet)}
 */
@Deprecated
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
public static Version parseAnalysisVersion(Settings indexSettings, Settings settings, Logger logger) {
    // check for an explicit version on the specific analyzer component
    String sVersion = settings.get("version");
    if (sVersion != null) {
        return Lucene.parseVersion(sVersion, Version.LATEST, logger);
    }
    // check for an explicit version on the index itself as the default for all analysis components
    sVersion = indexSettings.get("index.analysis.version");
    if (sVersion != null) {
        return Lucene.parseVersion(sVersion, Version.LATEST, logger);
    }
    // resolve the analysis version based on the version the index was created with
    return org.elasticsearch.Version.indexCreated(indexSettings).luceneVersion;
}
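// --- Hedged sketch, not from the original source ---
// The resolution order demonstrated with illustrative settings: a per-component
// "version" wins over the index-wide "index.analysis.version" default, which in
// turn wins over the version the index was created with. The setting values and
// the logger variable here are assumptions.
Settings indexSettings = Settings.builder()
    .put("index.analysis.version", "4.3")
    .build();
Settings componentSettings = Settings.builder()
    .put("version", "4.4")
    .build();
Version resolved = parseAnalysisVersion(indexSettings, componentSettings, logger);
// resolved comes from the component-level "4.4", since it is checked first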
public static CharArraySet getWordSet(Environment env, org.elasticsearch.Version indexCreatedVersion,
        Settings settings, String settingsPrefix) {
    List<String> wordList = getWordList(env, settings, settingsPrefix);
    if (wordList == null) {
        return null;
    }
    boolean ignoreCase = settings.getAsBooleanLenientForPreEs6Indices(
        indexCreatedVersion, settingsPrefix + "_case", false, deprecationLogger);
    return new CharArraySet(wordList, ignoreCase);
}
public StoreFileMetaData(String name, long length, String checksum, Version writtenBy, BytesRef hash) {
    // It's possible to have a _na_ checksum or an unsupported writtenBy version here if the
    // file is a segments_N file, but that is fine in the case of a segments_N file because
    // we handle that case upstream.
    assert name.startsWith("segments_") || (writtenBy != null && writtenBy.onOrAfter(FIRST_LUCENE_CHECKSUM_VERSION)) :
        "index versions older than " + FIRST_LUCENE_CHECKSUM_VERSION + " are not supported but got: " + writtenBy;
    this.name = Objects.requireNonNull(name, "name must not be null");
    this.length = length;
    this.checksum = Objects.requireNonNull(checksum, "checksum must not be null");
    this.writtenBy = Objects.requireNonNull(writtenBy, "writtenBy must not be null");
    this.hash = hash == null ? new BytesRef() : hash;
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4_0)) {
        return new DictionaryCompoundWordTokenFilter(tokenStream, wordList,
            minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    } else {
        return new Lucene43DictionaryCompoundWordTokenFilter(tokenStream, wordList,
            minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
}
public void testToFromXContent() throws IOException {
    final int iters = scaledRandomIntBetween(1, 10);
    for (int iter = 0; iter < iters; iter++) {
        final BytesRef hash = new BytesRef(scaledRandomIntBetween(0, 1024 * 1024));
        hash.length = hash.bytes.length;
        for (int i = 0; i < hash.length; i++) {
            hash.bytes[i] = randomByte();
        }
        StoreFileMetaData meta = new StoreFileMetaData("foobar", Math.abs(randomLong()),
            randomAsciiOfLengthBetween(1, 10), Version.LATEST, hash);
        ByteSizeValue size = new ByteSizeValue(Math.abs(randomLong()));
        BlobStoreIndexShardSnapshot.FileInfo info = new BlobStoreIndexShardSnapshot.FileInfo("_foobar", meta, size);
        XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON).prettyPrint();
        BlobStoreIndexShardSnapshot.FileInfo.toXContent(info, builder, ToXContent.EMPTY_PARAMS);
        byte[] xcontent = BytesReference.toBytes(shuffleXContent(builder).bytes());

        final BlobStoreIndexShardSnapshot.FileInfo parsedInfo;
        try (XContentParser parser = createParser(JsonXContent.jsonXContent, xcontent)) {
            parser.nextToken();
            parsedInfo = BlobStoreIndexShardSnapshot.FileInfo.fromXContent(parser);
        }
        assertThat(info.name(), equalTo(parsedInfo.name()));
        assertThat(info.physicalName(), equalTo(parsedInfo.physicalName()));
        assertThat(info.length(), equalTo(parsedInfo.length()));
        assertThat(info.checksum(), equalTo(parsedInfo.checksum()));
        assertThat(info.partSize(), equalTo(parsedInfo.partSize()));
        assertThat(parsedInfo.metadata().hash().length, equalTo(hash.length));
        assertThat(parsedInfo.metadata().hash(), equalTo(hash));
        assertThat(parsedInfo.metadata().writtenBy(), equalTo(Version.LATEST));
        assertThat(parsedInfo.isSame(info.metadata()), is(true));
    }
}
public void testStoreStats() throws IOException {
    final ShardId shardId = new ShardId("index", "_na_", 1);
    DirectoryService directoryService = new LuceneManagedDirectoryService(random());
    Settings settings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT)
        .put(Store.INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING.getKey(), TimeValue.timeValueMinutes(0))
        .build();
    Store store = new Store(shardId, IndexSettingsModule.newIndexSettings("index", settings),
        directoryService, new DummyShardLock(shardId));
    long initialStoreSize = 0;
    for (String extraFiles : store.directory().listAll()) {
        assertTrue("expected extraFS file but got: " + extraFiles, extraFiles.startsWith("extra"));
        initialStoreSize += store.directory().fileLength(extraFiles);
    }
    StoreStats stats = store.stats();
    assertEquals(stats.getSize().getBytes(), initialStoreSize);

    Directory dir = store.directory();
    final long length;
    try (IndexOutput output = dir.createOutput("foo.bar", IOContext.DEFAULT)) {
        int iters = scaledRandomIntBetween(10, 100);
        for (int i = 0; i < iters; i++) {
            BytesRef bytesRef = new BytesRef(TestUtil.randomRealisticUnicodeString(random(), 10, 1024));
            output.writeBytes(bytesRef.bytes, bytesRef.offset, bytesRef.length);
        }
        length = output.getFilePointer();
    }

    assertTrue(numNonExtraFiles(store) > 0);
    stats = store.stats();
    assertEquals(stats.getSizeInBytes(), length + initialStoreSize);

    deleteContent(store.directory());
    IOUtils.close(store);
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        return new KeepWordFilter(tokenStream, keepWords);
    } else {
        @SuppressWarnings("deprecation")
        final TokenStream filter = new Lucene43KeepWordFilter(enablePositionIncrements, tokenStream, keepWords);
        return filter;
    }
}
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
        if (!EdgeNGramTokenFilter.Side.FRONT.getLabel().equals(side)) {
            throw new IllegalArgumentException(EdgeNGramTokenizer.class.getSimpleName()
                + " does not support backward n-grams as of Lucene 4.4");
        }
        return new EdgeNGramTokenizer(input, minGramSize, maxGramSize);
    } else {
        return new Lucene43EdgeNGramTokenizer(luceneMatchVersion, input, side, minGramSize, maxGramSize);
    }
}
public FbEntityIndexer(String namefile, String outputDir, String indexingStrategy) throws IOException {
    if (!indexingStrategy.equals("exact") && !indexingStrategy.equals("inexact"))
        throw new RuntimeException("Bad indexing strategy: " + indexingStrategy);

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
        indexingStrategy.equals("exact") ? new KeywordAnalyzer() : new StandardAnalyzer(Version.LUCENE_44));
    config.setOpenMode(OpenMode.CREATE);
    config.setRAMBufferSizeMB(256.0);
    indexer = new IndexWriter(new SimpleFSDirectory(new File(outputDir)), config);
    this.nameFile = namefile;
}
/**
 * Create an in-memory index directory.
 */
public void createRAMDirectory() throws Exception {
    this.directory = new RAMDirectory();
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, this.analyzer);
    IndexWriter indexWriter = new IndexWriter(this.directory, indexWriterConfig);
    indexWriter.close();
}
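// --- Hedged usage sketch, not from the original source ---
// After createRAMDirectory() the in-memory index exists but is empty. Assuming the
// same directory and analyzer fields, it could be populated and read back like this;
// the field name "content" and the document text are illustrative.
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, this.analyzer);
try (IndexWriter writer = new IndexWriter(this.directory, config)) {
    Document doc = new Document();
    doc.add(new TextField("content", "hello lucene", Field.Store.YES));
    writer.addDocument(doc);
}
try (DirectoryReader reader = DirectoryReader.open(this.directory)) {
    System.out.println(reader.numDocs()); // prints 1
}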
/**
 * Construct a new complete SegmentInfo instance from input.
 * <p>Note: this is public only to allow access from
 * the codecs package.</p>
 */
public SegmentInfo(Directory dir, Version version, String name, int docCount,
        boolean isCompoundFile, Codec codec, Map<String,String> diagnostics, Map<String,String> attributes) {
    assert !(dir instanceof TrackingDirectoryWrapper);
    this.dir = dir;
    this.version = version;
    this.name = name;
    this.docCount = docCount;
    this.isCompoundFile = isCompoundFile;
    this.codec = codec;
    this.diagnostics = diagnostics;
    this.attributes = attributes;
}
@Override
public TokenStream create(TokenStream input) {
    // if the dictionary is null, it means it was empty
    if (dictionary == null) {
        return input;
    }
    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
        return new DictionaryCompoundWordTokenFilter(input, dictionary,
            minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
    return new Lucene43DictionaryCompoundWordTokenFilter(input, dictionary,
        minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
private static void setDiagnostics(SegmentInfo info, String source, Map<String,String> details) {
    Map<String,String> diagnostics = new HashMap<>();
    diagnostics.put("source", source);
    diagnostics.put("lucene.version", Version.LATEST.toString());
    diagnostics.put("os", Constants.OS_NAME);
    diagnostics.put("os.arch", Constants.OS_ARCH);
    diagnostics.put("os.version", Constants.OS_VERSION);
    diagnostics.put("java.version", Constants.JAVA_VERSION);
    diagnostics.put("java.vendor", Constants.JAVA_VENDOR);
    diagnostics.put("timestamp", Long.toString(new Date().getTime()));
    if (details != null) {
        diagnostics.putAll(details);
    }
    info.setDiagnostics(diagnostics);
}
/**
 * @deprecated Use {@link #CharArrayMap(int, boolean)}
 */
@Deprecated
@SuppressWarnings("unchecked")
public CharArrayMap(Version matchVersion, int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    // grow the capacity to the next power of two that keeps startSize below ~80% of it
    int size = INIT_SIZE;
    while (startSize + (startSize >> 2) > size)
        size <<= 1;
    keys = new char[size][];
    values = (V[]) new Object[size];
    this.charUtils = CharacterUtils.getInstance(matchVersion);
    this.matchVersion = matchVersion;
}
/**
 * @deprecated Use {@link #DanishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/**
 * @deprecated Use {@link #HyphenationCompoundWordTokenFilter(TokenStream,HyphenationTree)}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator) {
    this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE);
}
/**
 * @deprecated Use {@link #NorwegianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/**
 * @deprecated Use {@link #FinnishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/**
 * @deprecated Use {@link #ArmenianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/**
 * @deprecated Use {@link #CharTokenizer(AttributeFactory, Reader)}
 */
@Deprecated
public CharTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
    super(factory, input);
    charUtils = CharacterUtils.getInstance(matchVersion);
}
/**
 * @deprecated Use {@link #ExtendableQueryParser(String, Analyzer, Extensions)}
 */
@Deprecated
public ExtendableQueryParser(final Version matchVersion, final String f, final Analyzer a, final Extensions ext) {
    super(matchVersion, f, a);
    this.defaultField = f;
    this.extensions = ext;
}
@SuppressWarnings("deprecation") @Override public TokenStream create(TokenStream tokenStream) { final Version version = this.version == Version.LUCENE_4_3 ? Version.LUCENE_4_4 : this.version; // we supported it since 4.3 if (version.onOrAfter(Version.LUCENE_4_3)) { return new NGramTokenFilter(tokenStream, minGram, maxGram); } else { return new Lucene43NGramTokenFilter(tokenStream, minGram, maxGram); } }
/**
 * @deprecated Use {@link #DutchAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
    // historically, this ctor never populated the stem dict,
    // so we populate it only for >= 3.6
    this(matchVersion, stopwords, stemExclusionTable,
        matchVersion.onOrAfter(Version.LUCENE_3_6)
            ? DefaultSetHolder.DEFAULT_STEM_DICT
            : CharArrayMap.<String>emptyMap());
}
public StoreFileMetaData(String name, long length, String checksum, Version writtenBy, BytesRef hash) {
    this.name = name;
    this.length = length;
    this.checksum = checksum;
    this.writtenBy = writtenBy;
    this.hash = hash == null ? new BytesRef() : hash;
}
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService,
        Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    if (version.onOrAfter(Version.LUCENE_4_4) && settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4"
            + " as it can create broken token streams."
            + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
    }
    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
/**
 * @deprecated Use {@link #EnglishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
public static String lemmatize(String query) {
    StringBuilder sb = new StringBuilder();
    ItalianAnalyzer analyzer = new ItalianAnalyzer(Version.LUCENE_44);
    try {
        TokenStream tokenStream = analyzer.tokenStream("label", query);
        CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(token.toString());
        }
        // honor the TokenStream contract: end() then close()
        tokenStream.end();
        tokenStream.close();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        // fall back to the raw query on failure
        sb = new StringBuilder();
        sb.append(query);
    } finally {
        analyzer.close();
    }
    return sb.toString();
}
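// --- Hedged usage sketch, not from the original source ---
// ItalianAnalyzer lowercases, drops Italian stopwords (e.g. "le", "della"), and
// light-stems the remaining tokens, so the exact output depends on the Lucene
// version; the input string here is illustrative.
String stemmed = lemmatize("Le città della Toscana");
// stopwords are removed; the remaining tokens come back lowercased, stemmed,
// and joined by single spaces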
/**
 * @deprecated Use {@link #FilteringTokenFilter(TokenStream)}
 */
@Deprecated
public FilteringTokenFilter(Version version, TokenStream in) {
    super(in);
    this.version = version;
    this.enablePositionIncrements = true;
}
private void initialize(File indexDir) {
    try {
        Stopwatch stopwatch = Stopwatch.createStarted();
        indexSearcher = new IndexSearcher(IndexReader.open(FSDirectory.open(indexDir)));
        analyzer = new StandardAnalyzer(Version.LUCENE_36);
        log.info("Initialized lucene index at {} ({})", indexDir.getPath(), stopwatch.stop());
    } catch (IOException e) {
        throw new RuntimeException("Unable to locate Lucene index.", e);
    }
}
public final Version getLuceneMatchVersion() {
    return this.luceneMatchVersion;
}
/**
 * @deprecated Use {@link #TypeTokenFilter(TokenStream,Set)}
 */
@Deprecated
public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
    this(version, input, stopTypes, false);
}