The following 22 code examples, extracted from open-source Python projects, illustrate how to use elasticsearch_dsl.Index().
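The snippets come from different projects and library versions, but most follow the same Index() lifecycle: register a connection, declare the index with its settings and document types, then create, refresh, or delete it. A minimal, self-contained sketch of that pattern is shown first; it assumes a pre-7.x elasticsearch_dsl release (where Index.doc_type() still exists, as in the examples below), and the 'pages' index name and Page document class are placeholders chosen for illustration only.

# Minimal sketch, assuming elasticsearch_dsl < 7 and a local Elasticsearch node;
# the index name 'pages' and the Page class are illustrative placeholders.
from elasticsearch_dsl import DocType, Text, Date, Index
from elasticsearch_dsl.connections import connections

# Register a default connection used by Index and DocType operations below.
connections.create_connection(hosts=['localhost'], timeout=20)

class Page(DocType):
    url = Text()
    crawled_at = Date()

idx = Index('pages')
idx.settings(number_of_shards=1, number_of_replicas=0)
idx.doc_type(Page)            # attach the Page mapping to this index

if idx.exists():              # drop a stale index, ignoring "not found"
    idx.delete(ignore=404)
idx.create()

Page(url='http://example.com').save()
idx.refresh()                 # make the freshly indexed document searchable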
def index_job(link):
    """
    Index a single page.
    """
    print("index page : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
    process.start()  # block until finished
def applyConfig(self):
    try:
        print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
        self.es = connections.create_connection(hosts=[self.confESHost])
        self.idx = Index(self.confESIndex)
        self.idx.doc_type(DocHTTPRequestResponse)
        if self.idx.exists():
            self.idx.open()
        else:
            self.idx.create()
        self.callbacks.saveExtensionSetting("elasticburp.host", self.confESHost)
        self.callbacks.saveExtensionSetting("elasticburp.index", self.confESIndex)
        self.callbacks.saveExtensionSetting("elasticburp.tools", str(self.confBurpTools))
        self.callbacks.saveExtensionSetting("elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
    except Exception as e:
        JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)

### ITab ###
def execute(self):
    idx = es.Index(self.index)

    try:
        idx.delete(ignore=404)
        log.info("Deleted index %s", self.index)
    except AttributeError:
        log.warning("Could not delete index '%s', ignoring", self.index)
    except NotFoundError:
        log.warning("Could not delete index '%s', ignoring", self.index)

    # create doc types
    for dt in self.doc_types:
        idx.doc_type(dt)

    # create index
    idx.create()
def _synchronise_index(self, sql_table_cls, es_doc_cls, id_logger):
    es_doc = es_doc_cls()

    self._logging(logging.INFO, 'Synchronizing %s index.' % es_doc.index)

    with acquire_inter_process_lock('sync_%s' % es_doc.index) as acquired:
        if not acquired:
            es_doc = es_doc_cls()
            err_msg = 'Another process is already synchronizing the %s ' \
                      'index, aborting now.' % es_doc.index
            self._logging(logging.WARNING, err_msg)
        else:
            self._perform_index_sync(sql_table_cls, es_doc_cls, id_logger)

    self._logging(logging.INFO, 'Index %s is now synchronized.' % es_doc.index)
def _perform_geocomplete_index_population(self, max_doc):
    elasticsearch_conn = connections.get_connection()

    to_index = list()

    for i, document in enumerate(self._geocompletion_documents()):
        if i % max_doc == 0:
            log_msg = 'Computing required geoloc-entry documents.'
            self._logging(logging.INFO, log_msg)

        to_index.append(document.to_dict(True))

        if len(to_index) < max_doc:
            continue

        self._geocomplete_index_batch(elasticsearch_conn, to_index)
        to_index = list()

    if len(to_index) != 0:
        self._geocomplete_index_batch(elasticsearch_conn, to_index)

    elasticsearch_dsl.Index('geocomplete').refresh()
def _perform_index_purge(self, index_name, index_settings, doc_type_class):
    log_msg = 'Dropping %s index.' % index_name
    self._logging(logging.INFO, log_msg)

    index = elasticsearch_dsl.Index(index_name)
    index.settings(**index_settings)
    index.doc_type(doc_type_class)

    try:
        index.delete(ignore=404)
        index.create()
    except elasticsearch.exceptions.ElasticsearchException as e:
        log_msg = 'Error while dropping %s index: %s.' % (index_name, e)
        self._logging(logging.ERROR, log_msg)
        return

    log_msg = 'Index %s has been dropped successfully.' % index_name
    self._logging(logging.INFO, log_msg)
def index():
    """
    URL : /index
    Index a new URL in search engine.
    Method : POST
    Form data :
        - url : the url to index [string, required]
    Return a success message.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # launch exploration job
    index_job.delay(data["url"])

    return "Indexing started"
def _create_index(self):
    dt = datetime.utcnow()
    dt = dt.strftime('%Y.%m')
    es = connections.get_connection()

    if not es.indices.exists('indicators-{}'.format(dt)):
        index = Index('indicators-{}'.format(dt))
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.save('indicators-{}'.format(dt))

    return 'indicators-{}'.format(dt)
def mitm_request(self, data):
    # Initialize ES connection and index
    res = connections.create_connection(hosts=[args.elasticsearch])
    idx = Index(args.index)
    idx.doc_type(DocHTTPRequestResponse)
    try:
        DocHTTPRequestResponse.init()
        idx.create()
    except:
        pass

    r = HTTPRequest(data)

    # determine url
    if self.is_connect:
        scheme = "https"
    else:
        scheme = "http"
    url = scheme + "://" + self.hostname
    if scheme == "http" and int(self.port) != 80 or scheme == "https" and int(self.port) != 443:
        url += ":" + str(self.port)
    url += self.path

    if args.verbose:
        print(url)

    self.doc = DocHTTPRequestResponse(host=self.hostname, port=int(self.port), protocol=scheme)
    self.doc.meta.index = args.index
    self.doc.request.url = url
    self.doc.request.requestline = r.requestline
    self.doc.request.method = r.command
    self.doc.host = self.hostname
    self.doc.port = int(self.port)
    self.doc.protocol = scheme

    return data
def createMenuItems(self, invocation):
    menuItems = list()
    selectedMsgs = invocation.getSelectedMessages()
    if selectedMsgs != None and len(selectedMsgs) >= 1:
        menuItems.append(JMenuItem("Add to ElasticSearch Index", actionPerformed=self.genAddToES(selectedMsgs, invocation.getInputEvent().getComponent())))
    return menuItems
def migrate():
    hidden_services = Index('hiddenservices')
    hidden_services.delete(ignore=404)
    hidden_services = Index('hiddenservices')
    hidden_services.doc_type(DomainDocType)
    hidden_services.doc_type(PageDocType)
    hidden_services.settings(number_of_shards=8, number_of_replicas=1)
    hidden_services.create()
def tearDown(self):
    index = Index(settings.ELASTICSEARCH_INDEX)
    index.delete(ignore=404)
def _index_img(self, img):
    """Index a single img and ensure that it's been propagated to the search engine"""
    image = search.db_image_to_index(img)
    image.save()
    index = Index(name=settings.ELASTICSEARCH_INDEX)
    index.flush(force=True)
    index.refresh()
def index_all_images(self, chunk_size=DEFAULT_CHUNK_SIZE, num_iterations=DEFAULT_NUM_ITERATIONS,
                     num_threads=DEFAULT_NUM_THREADS):
    """Index every record in the database with a server-side cursor"""
    index = Index(settings.ELASTICSEARCH_INDEX)
    if not index.exists():
        log.info("Creating new index %s", settings.ELASTICSEARCH_INDEX)
        search.Image.init()
        mapping = search.Image._doc_type.mapping
        mapping.save(settings.ELASTICSEARCH_INDEX)
        log.info("Done creating new index")

    with Pool(num_threads) as pool:
        starts = [i * chunk_size for i in range(0, num_iterations)]
        pool.starmap(do_index, zip(starts, itertools.repeat(chunk_size, len(starts))))
def test_aggregation_without_events(app, es_with_templates):
    """Check that the aggregation doesn't crash if there are no events.

    This scenario happens when celery starts aggregating but no events
    have been created yet.
    """
    # Aggregate events
    StatAggregator(event='file-download',
                   aggregation_field='file_id',
                   aggregation_interval='day',
                   query_modifiers=[]).run()
    assert not Index(
        'stats-file-download', using=current_search_client
    ).exists()

    # Create the index but without any event. This happens when the events
    # have been indexed but are not yet searchable (before index refresh).
    Index('events-stats-file-download-2017',
          using=current_search_client).create()

    # Wait for the index to be available
    time.sleep(1)

    # Aggregate events
    StatAggregator(event='file-download',
                   aggregation_field='file_id',
                   aggregation_interval='day',
                   query_modifiers=[]).run()
    assert not Index(
        'stats-file-download', using=current_search_client
    ).exists()
def get_bookmark(self):
    """Get last aggregation date."""
    if not Index(self.aggregation_alias, using=self.client).exists():
        if not Index(self.event_index, using=self.client).exists():
            return datetime.date.today()
        return self._get_oldest_event_timestamp()

    # retrieve the oldest bookmark
    query_bookmark = Search(
        using=self.client,
        index=self.aggregation_alias,
        doc_type='{0}-bookmark'.format(self.event)
    )[0:1].sort(
        {'date': {'order': 'desc'}}
    )
    bookmarks = query_bookmark.execute()
    # if no bookmark is found but the index exists, the bookmark was somehow
    # lost or never written, so restart from the beginning
    if len(bookmarks) == 0:
        return self._get_oldest_event_timestamp()

    # change it to doc_id_suffix
    bookmark = datetime.datetime.strptime(bookmarks[0].date, self.doc_id_suffix)
    return bookmark
def run(self):
    """Calculate statistics aggregations."""
    # If no events have been indexed there is nothing to aggregate
    if not Index(self.event_index, using=self.client).exists():
        return

    lower_limit = self.get_bookmark()
    # Stop here if no bookmark could be estimated.
    if lower_limit is None:
        return

    upper_limit = min(
        datetime.datetime.utcnow().replace(microsecond=0),
        datetime.datetime.combine(lower_limit +
                                  datetime.timedelta(self.batch_size),
                                  datetime.datetime.min.time())
    )
    while upper_limit <= datetime.datetime.utcnow():
        self.indices = set()
        self.new_bookmark = upper_limit.strftime(self.doc_id_suffix)
        bulk(self.client,
             self.agg_iter(lower_limit, upper_limit),
             stats_only=True,
             chunk_size=50)
        # Flush all indices which have been modified
        current_search_client.indices.flush(
            index=','.join(self.indices),
            wait_if_ongoing=True
        )
        self.set_bookmark()
        self.indices = set()
        lower_limit = lower_limit + datetime.timedelta(self.batch_size)
        upper_limit = min(datetime.datetime.utcnow().replace(microsecond=0),
                          lower_limit + datetime.timedelta(self.batch_size))
        if lower_limit > upper_limit:
            break
def execute(self):
    """
    Index data of specified queryset
    """
    client = elasticsearch.Elasticsearch(
        hosts=settings.ELASTIC_SEARCH_HOSTS,
        # sniff_on_start=True,
        retry_on_timeout=True,
        refresh=True
    )

    start_time = time.time()
    duration = time.time()
    loop_time = elapsed = duration - start_time

    for batch_i, total_batches, start, end, total, qs in self.batch_qs():
        loop_start = time.time()
        total_left = ((total_batches - batch_i) * loop_time)

        progres_msg = \
            '%s of %s : %8s %8s %8s duration: %.2f left: %.2f' % (
                batch_i, total_batches, start, end, total,
                elapsed, total_left
            )

        log.debug(progres_msg)

        helpers.bulk(
            client,
            (self.convert(obj).to_dict(include_meta=True) for obj in qs),
            raise_on_error=True,
            refresh=True
        )

        now = time.time()
        elapsed = now - start_time
        loop_time = now - loop_start
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
    es_doc = es_doc_cls()

    elasticsearch_conn = connections.get_connection()

    sync_timestamp = current_server_timestamp()

    pending_insertions = self._compute_dirty_documents(
        sql_table_cls, es_doc.doc_type)

    bulk_op = self._synchronisation_op(es_doc, pending_insertions)

    self._logging(logging.INFO, 'Performing synchronization.')

    for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
        obj_id = info['index']['_id'] \
            if 'index' in info else info['update']['_id']

        if ok:
            # Mark the task as handled so we don't re-process it next time
            self._logging(logging.INFO,
                          'Document %s has been synced successfully.' % obj_id)
            sql_table_cls.update_last_sync(obj_id, sync_timestamp)
        else:
            id_logger(obj_id, logging.ERROR,
                      'Error while syncing document %s index.' % obj_id)

    # Refresh indices to increase search speed
    elasticsearch_dsl.Index(es_doc.index).refresh()
def create_indices(endpoint):
    """
    Creates constituent and address indices in PIC
    """
    connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    pic_index = Index('pic')
    pic_index.doc_type(Constituent)
    pic_index.doc_type(Address)
    pic_index.delete(ignore=404)
    pic_index.settings(
        number_of_shards=5,
        number_of_replicas=2
    )
    pic_index.create()
def __init__(self, config_file='config.cfg'):
    super(Elastic, self).__init__()
    self.percentage = 10.0
    self.minimum_occurrences = 250

    # The ConfigParser documentation points out that there's no way to force
    # defaults config option outside the "DEFAULT" section.
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section('elastic'):
        config.add_section('elastic')

    for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2',
                          'index': 'nxapi', 'doc_type': 'events'}.items():
        if not config.has_option('elastic', option):
            config.set('elastic', option, value)

    self.version = config.getint('elastic', 'version')
    self.index = config.get('elastic', 'index')
    use_ssl = config.getboolean('elastic', 'use_ssl')
    host = config.get('elastic', 'host')
    self.doc_type = config.get('elastic', 'doc_type')
    self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index,
                                                version=self.version, doc_type=self.doc_type,
                                                timeout=30, retry_on_timeout=True)

    Event.init(index=self.index)
    index = Index(self.index, using=self.client)
    index.doc_type(Event)
    self.initialize_search()
def handle(self, *args, **options):
    if options['verbose']:
        log.setLevel(logging.DEBUG)
    es = search.init_es(timeout=2000)
    oldindex = Index(options['oldindex'])
    client = elasticsearch.client.IndicesClient(es)

    # Create the new index
    newindex = Index(options['newindex'])
    newindex.doc_type(search.Image)
    try:
        newindex.create()
    except elasticsearch.exceptions.RequestError as e:
        if options['force']:
            log.warn("Trying to delete previously-created new index %s", options['newindex'])
            newindex.delete()
            newindex.create()
        else:
            raise e
    log.info("Done creating new index %s", options['newindex'])

    log.info("Copying data on %s to %s", options['oldindex'], options['newindex'])
    # Would love to use ES native reindex() but AWS's service doesn't support it :(
    elasticsearch.helpers.reindex(es, options['oldindex'], options['newindex'])

    # Wait for it to be happy
    if not settings.DEBUG:
        es.cluster.health(wait_for_status='green', request_timeout=2000)

    # Is the value of 'oldindex' an alias or a real index?
    if client.exists_alias(name=settings.ELASTICSEARCH_INDEX):
        log.info("Confirmed that value of %s is an alias and not a real index" % options['oldindex'])
        alias_move = """{
            "actions" : [
                { "remove" : { "index" : "%s", "alias" : "%s" } },
                { "add" : { "index" : "%s", "alias" : "%s" } }
            ]
        }""" % (options['oldindex'], settings.ELASTICSEARCH_INDEX,
                options['newindex'], settings.ELASTICSEARCH_INDEX)
        client.update_aliases(alias_move)
    elif client.exists(options['oldindex']):
        log.info("%s is a real index and not an alias, fixing" % options['oldindex'])
        # Delete the old index
        log.info("Deleting %s -- this will cause some downtime", options['oldindex'])
        oldindex.delete()
        client.put_alias(options['newindex'], settings.ELASTICSEARCH_INDEX)

    # Confirm number of documents in current settings
    s = Search()
    response = s.execute()
    log.info("%d results available in %s" % (response.hits.total, settings.ELASTICSEARCH_INDEX))