Python elasticsearch.helpers module: bulk() code examples

We have extracted the following 49 code examples from open-source Python projects to illustrate how to use elasticsearch.helpers.bulk().
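Before the project snippets, here is a minimal sketch of the basic calling pattern, assuming a local Elasticsearch node and invented index, type and field names. helpers.bulk() takes a client plus an iterable of action dicts and, by default, returns a tuple of (number of successfully executed actions, list of errors).

from elasticsearch import Elasticsearch
from elasticsearch import helpers

es = Elasticsearch()  # assumes a node reachable on localhost:9200

# Each action is a plain dict; keys starting with "_" are bulk metadata,
# and "_source" (or any remaining keys) becomes the document body.
actions = (
    {
        "_op_type": "index",        # default; may also be "create", "update" or "delete"
        "_index": "example-index",  # hypothetical index name
        "_type": "example-type",    # doc types exist in the ES versions these snippets target
        "_id": i,
        "_source": {"title": "doc %d" % i},
    }
    for i in range(100)
)

success, errors = helpers.bulk(es, actions, chunk_size=50)
print(success, errors)

The examples below vary this pattern: some build the action list eagerly, some stream it from a generator, and several pass stats_only or raise_on_error to change how failures are reported.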

Project: mygene.info    Author: biothings
def index_bulk(self, docs, step=None):
        index_name = self.ES_INDEX_NAME
        doc_type = self.ES_INDEX_TYPE
        step = step or self.step

        def _get_bulk(doc):
            doc.update({
                "_index": index_name,
                "_type": doc_type,
            })
            return doc
        actions = (_get_bulk(doc) for doc in docs)
        try:
            return helpers.bulk(self.conn, actions, chunk_size=step)
        except helpers.BulkIndexError as e:
            # try again...
            print("Bulk error, try again...")
            return self.index_bulk(docs, step)
            ##return helpers.bulk(self.conn, actions, chunk_size=step)
        except Exception as e:
            print("Err...")
            import pickle
            pickle.dump(e, open("err", "wb"))
Project: mygene.info    Author: biothings
def delete_docs(self, ids, step=None):
        index_name = self.ES_INDEX_NAME
        doc_type = self.ES_INDEX_TYPE
        step = step or self.step

        def _get_bulk(_id):
            doc = {
                '_op_type': 'delete',
                "_index": index_name,
                "_type": doc_type,
                "_id": _id
            }
            return doc
        actions = (_get_bulk(_id) for _id in ids)
        return helpers.bulk(self.conn, actions, chunk_size=step,
                            stats_only=True, raise_on_error=False)
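The delete snippet above passes stats_only=True and raise_on_error=False, so helpers.bulk() returns two counters (successes, failures) instead of a list of error details and does not raise on documents that are already gone. A minimal sketch of that variant, with invented index and id values:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
actions = (
    {"_op_type": "delete", "_index": "example-index", "_type": "example-type", "_id": _id}
    for _id in ("id-1", "id-2", "id-3")
)
# stats_only=True turns the error list into a failure count;
# raise_on_error=False keeps missing ids from raising BulkIndexError.
deleted, failed = helpers.bulk(es, actions, chunk_size=100,
                               stats_only=True, raise_on_error=False)
print("deleted %d, failed %d" % (deleted, failed))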
Project: zing    Author: evernote
def handle(self, **options):
        self._initialize(**options)

        if (options['rebuild'] and
            not options['dry_run'] and
            self.es.indices.exists(self.INDEX_NAME)):

            self.es.indices.delete(index=self.INDEX_NAME)

        if (not options['dry_run'] and
            not self.es.indices.exists(self.INDEX_NAME)):

            self.es.indices.create(index=self.INDEX_NAME)

        if self.is_local_tm:
            self._set_latest_indexed_revision(**options)

        helpers.bulk(self.es, self._parse_translations(**options))
Project: ceres    Author: dicortazar
def upload_data(events_df, es_write_index, es_write):
    # Uploading info to the new ES
    rows = events_df.to_dict("index")
    docs = []
    for row_index in rows.keys():
        row = rows[row_index]
        item_id = row[Events.PERCEVAL_UUID] + "_" + row[Git.FILE_PATH] +\
            "_" + row[Git.FILE_EVENT]
        header = {
            "_index": es_write_index,
            "_type": "item",
            "_id": item_id,
            "_source": row
        }
        docs.append(header)
    helpers.bulk(es_write, docs)
    logging.info("Written: " + str(len(docs)))
Project: ceres    Author: dicortazar
def upload_data(events_df, es_write_index, es_write, uniq_id):
    # Uploading info to the new ES
    test = events_df.to_dict("index")
    docs = []
    for i in test.keys():
        header = {
               "_index": es_write_index,
               "_type": "item",
               "_id": int(uniq_id),
               "_source": test[i]
        }
        docs.append(header)
        uniq_id = uniq_id + 1
    print (len(docs))
    helpers.bulk(es_write, docs)
    items = []

    return uniq_id
Project: ceres    Author: dicortazar
def upload_data(events_df, es_write_index, es_write, uniq_id):
    # Uploading info to the new ES
    test = events_df.to_dict("index")
    docs = []
    for i in test.keys():
        header = {
               "_index": es_write_index,
               "_type": "item",
               "_id": int(uniq_id),
               "_source": test[i]
        }
        docs.append(header)
        uniq_id = uniq_id + 1
    print (len(docs))
    helpers.bulk(es_write, docs)
    items = []

    return uniq_id
Project: Hippocampe    Author: CERT-BDF
def update(typeNameES, listId):
    logger.info('bulkOp.update launched')
    hippoCfg = getHippoConf()
    es = getES()
    now = strftime("%Y%m%dT%H%M%S%z")
    indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')
    # k is a generator expression that produces an update
    # action for every doc whose id is in listId
    k = ({'_op_type': 'update', '_index':indexNameES, '_type':typeNameES, 'doc':{'lastQuery': now}, '_id': id}
        for id in listId)

    res = helpers.bulk(es, k)
    logger.info('bulkOp.update res: %s', res)
    #res looks like
    #(2650, [])  
    logger.info('bulkOp.update end')
    return res[0]
Project: Hippocampe    Author: CERT-BDF
def index(cfgPath, listData):
    logger.info('bulkOp.index launched')
    hippoCfg = getHippoConf()
    indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')

    cfg = getConf(cfgPath)
    typeNameES = cfg.get('elasticsearch', 'typeIntel')

    # create the index, only if it does not exist
    index = IndexIntel(cfgPath)
    index.createIndexIntel()

    es = getES()
    k = ({'_op_type': 'index', '_index':indexNameES, '_type':typeNameES, '_source': data}
        for data in listData)
    res = helpers.bulk(es,k, raise_on_error=False)
    #res = helpers.bulk(es,k, raise_on_exception=False)
    #res = helpers.bulk(es,k)
    logger.info('bulkOp.index res: %s', res)
    logger.info('bulkOp.index end')
    return res
Project: Hippocampe    Author: CERT-BDF
def indexNew(coreIntelligence, listData):
    logger.info('bulkOp.indexNew launched')

    hippoCfg = getHippoConf()
    indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')
    typeNameES = hippoCfg.get('elasticsearch', 'typeNameESNew')

    indexNew = IndexNew()
    indexNew.createIndexNew()

    es = getES()
    k = ({'_op_type': 'index', '_index':indexNameES, '_type':typeNameES, '_source': {'type': coreIntelligence, 'toSearch': data[coreIntelligence]}}
        for data in listData) 
    # next(k) gives something like:
    # {'_op_type': 'index', '_index': 'hippocampe', '_type': 'new', '_source': {'type': 'ip', 'toSearch': '1.1.1.1'}}
    res = helpers.bulk(es,k)
    logger.info('bulkOp.index res: %s', res)
    logger.info('bulkOp.indexNew end')  
    return res[0]
Project: WASE    Author: thomaspatzke
def genAddToES(self, msgs, component):
        def menuAddToES(e):
            progress = ProgressMonitor(component, "Feeding ElasticSearch", "", 0, len(msgs))
            i = 0
            docs = list()
            for msg in msgs:
                if not Burp_onlyResponses or msg.getResponse():
                    docs.append(self.genESDoc(msg, timeStampFromResponse=True).to_dict(True))
                i += 1
                progress.setProgress(i)
            success, failed = bulk(self.es, docs, True, raise_on_error=False)
            progress.close()
            JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Successfully imported %d messages, %d messages failed.</p></html>" % (success, failed), "Finished", JOptionPane.INFORMATION_MESSAGE)
        return menuAddToES

    ### Interface to ElasticSearch ###
Project: repoxplorer    Author: morucci
def update_commits(self, source_it, field='repos'):
        """ Take the sha from each doc and use
        it to reference the doc to update. This method only
        support updating a single field for now. The default one
        is repos because that's the only one to make sense in
        this context.
        """
        def gen(it):
            for source in it:
                d = {}
                d['_index'] = self.index
                d['_type'] = self.dbname
                d['_op_type'] = 'update'
                d['_id'] = source['sha']
                d['_source'] = {'doc': {field: source[field]}}
                yield d
        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)
Project: elasticsearch-bench    Author: anuragkh
def load_data(input_file, index, doc_type, seed):
  doc_no = seed
  successful = 0
  docs = []
  with open(input_file) as ifp:
    for line in ifp:
      doc_id = str(doc_no)
      doc = csv2json(index, doc_type, doc_id, line.rstrip())
      docs.append(doc)
      doc_no += 1
      if len(docs) == batch_size:
        docs_iter = iter(docs)
        (added, tmp) = helpers.bulk(es, docs_iter)
        successful += added
        docs = []
      if doc_no % 100000 == 0:
        print('success: %d failed: %s' % (successful, doc_no - successful - seed))

  if len(docs) > 0:
    docs_iter = iter(docs)
    (added, tmp) = helpers.bulk(es, docs_iter)
    successful += added

  print('Finished! Inserted: %d Failed: %d' % (successful, doc_no - successful - seed))
Project: elasticsearch_loader    Author: moshe
def single_bulk_to_es(bulk, config, attempt_retry):
    bulk = bulk_builder(bulk, config)

    max_attempt = 1
    if attempt_retry:
        max_attempt += 3

    for attempt in range(1, max_attempt+1):
        try:
            helpers.bulk(config['es_conn'], bulk, chunk_size=config['bulk_size'])
        except Exception as e:
            if attempt < max_attempt:
                wait_seconds = attempt*3
                log('warn', 'attempt [%s/%s] got exception, will retry after %s seconds' % (attempt,max_attempt,wait_seconds) )
                time.sleep(wait_seconds)
                continue

            log('error', 'attempt [%s/%s] got exception, it is a permanent data loss, no retry any more' % (attempt,max_attempt) )
            raise e

        if attempt > 1:
            log('info', 'attempt [%s/%s] succeed. we just get recovered from previous error' % (attempt,max_attempt) )

        # completed successfully
        break
Project: sigir2017-table    Author: iai-group
def add_docs_bulk(self, docs):
        """Adds a set of documents to the index in a bulk.

        :param docs: dictionary {doc_id: doc}
        """
        actions = []
        for doc_id, doc in docs.items():
            action = {
                "_index": self.__index_name,
                "_type": self.DOC_TYPE,
                "_id": doc_id,
                "_source": doc
            }
            actions.append(action)

        if len(actions) > 0:
            helpers.bulk(self.__es, actions)
Project: parade    Author: bailaohe
def store(self, df, table, **kwargs):
        if isinstance(df, pd.DataFrame):
            es = self.open()

            records = df.to_dict(orient='records')

            if df.index.name:
                actions = [{
                    "_index": self.datasource.db,
                    "_type": table,
                    "_id": record[df.index.name],
                    "_source": record
                } for record in records]
            else:
                actions = [{
                    "_index": self.datasource.db,
                    "_type": table,
                    "_source": record
                } for record in records]

            if len(actions) > 0:
                helpers.bulk(es, actions)
Project: EventMonkey    Author: devgc
def BulkIndexRecords(self,records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
        success_count,failed_items = es_bulk(
            self.esh,
            records,
            chunk_size=10000,
            raise_on_error=False
        )

        if len(failed_items) > 0:
            ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
                os.getpid(),len(failed_items)
            ))
            for failed_item in failed_items:
                ELASTIC_LOGGER.error(unicode(failed_item))

        ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records')
Project: elastic2-doc-manager    Author: mongodb-labs
def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    successes, errors = bulk(self.elastic, action_buffer)
                    LOG.debug("Bulk request finished, successfully sent %d "
                              "operations", successes)
                    if errors:
                        LOG.error(
                            "Bulk request finished with errors: %r", errors)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")
Project: elastic2-doc-manager    Author: mongodb-labs
def __init__(self, docman):

        # Parent object
        self.docman = docman

        # Action buffer for bulk indexing
        self.action_buffer = []

        # Docs to update
        # List of documents for which the source first has to be
        # retrieved from Elasticsearch and then apply_update
        # needs to be performed
        # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ]
        self.doc_to_update = []

        # The dictionary below contains ids of documents
        # which need to be retrieved from Elasticsearch.
        # It prevents fetching the same document multiple times from ES
        # Format: {"_index": {"_type": {"_id": True}}}
        self.doc_to_get = {}

        # Dictionary of sources
        # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}}
        self.sources = {}
Project: cnschema    Author: cnschema
def load_data(self):
        """
        https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-suggesters-completion.html
        """
        es = self.connect()

        items = self._load_item_data()
        helpers.bulk(es, items)

        # id_field = "id"
        # es_index = self.es_config["es_index"]
        # es_type = self.es_config["es_type"]
        # for item in items:
        #     item = item['_source']
        #     logging.info(json.dumps(item, ensure_ascii=False, indent=4))
        #
        #     ret = es.index(index=es_index, doc_type=es_type, id=item[id_field], body=item)
        #     logging.info(ret)
Project: django-elasticsearch-dsl    Author: sabricot
def update(self, thing, refresh=None, action='index', **kwargs):
        """
        Update each document in ES for a model, iterable of models or queryset
        """
        if refresh is True or (
            refresh is None and self._doc_type.auto_refresh
        ):
            kwargs['refresh'] = True

        if isinstance(thing, models.Model):
            object_list = [thing]
        else:
            object_list = thing

        return self.bulk(
            self._get_actions(object_list, action), **kwargs
        )
Project: elasticsearch-fabric    Author: KunihikoKido
def bulk(chunk_size=100, filepath=None, **kwargs):
    if sys.stdin.isatty() is False:
        infile = sys.stdin
    elif filepath is not None:
        infile = open(filepath, "r")
    else:
        abort(bulk.__doc__)

    es = get_client(env.elasticsearch_alias)
    actions = []
    for action in infile.readlines():
        actions.append(json.loads(action))

    success, errors = helpers.bulk(es, actions, ignore=IGNORE, **kwargs)
    res = {
        "success": success, "errors": errors,
        "bulk": {
            "host": es.transport.get_connection().host
        }
    }
    infile.close()

    jsonprint(res)
    return res
Project: elastic-doc-manager    Author: mongodb-labs
def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    successes, errors = bulk(self.elastic, action_buffer)
                    LOG.debug("Bulk request finished, successfully sent %d "
                              "operations", successes)
                    if errors:
                        LOG.error(
                            "Bulk request finished with errors: %r", errors)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")
Project: elastic-doc-manager    Author: mongodb-labs
def __init__(self, docman):

        # Parent object
        self.docman = docman

        # Action buffer for bulk indexing
        self.action_buffer = []

        # Docs to update
        # List of documents for which the source first has to be
        # retrieved from Elasticsearch and then apply_update
        # needs to be performed
        # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ]
        self.doc_to_update = []

        # The dictionary below contains ids of documents
        # which need to be retrieved from Elasticsearch.
        # It prevents fetching the same document multiple times from ES
        # Format: {"_index": {"_type": {"_id": True}}}
        self.doc_to_get = {}

        # Dictionary of sources
        # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}}
        self.sources = {}
Project: fccforensics    Author: RagtagOpen
def bulk_update(es, actions, batch_size=250):
    indexed = 0
    for i in range(0, len(actions), batch_size):
        resp = bulk(es, actions[i:(i+batch_size)])
        indexed += resp[0]
        print('\tindexed %s / %s' % (indexed, len(actions)))
    return indexed
Project: fccforensics    Author: RagtagOpen
def index_worker(self, queue, size=200):
        actions = []
        indexed = 0
        while True:
            item = queue.get()
            if item is None:
                break
            id_submission, analysis = item

            doc = {
                '_index': 'fcc-comments',
                '_type': 'document',
                '_op_type': 'update',
                '_id': id_submission,
                'doc': {'analysis': analysis},
            }
            actions.append(doc)

            if len(actions) == size:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    try:
                        response = bulk(self.es, actions)
                        indexed += response[0]
                        print('\tanalyzed %s/%s\t%s%%' % (indexed, self.limit,
                            int(indexed / self.limit * 100)))
                        actions = []
                    except ConnectionTimeout:
                        print('error indexing: connection timeout')

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            response = bulk(self.es, actions)
            indexed += response[0]
            print('indexed %s' % (indexed))
Project: fccforensics    Author: RagtagOpen
def bulk_index(self, queue, size=20):

        actions = []
        indexed = 0
        ids = set()
        while True:
            item = queue.get()
            if item is None:
                break
            doc_id = item

            doc = {
                '_index': 'fcc-comments',
                '_type': 'document',
                '_op_type': 'update',
                '_id': doc_id,
                'doc': {'analysis.sentiment_sig_terms_ordered': True},
            }
            actions.append(doc)
            ids.add(doc_id)

            if len(actions) == size:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    try:
                        response = bulk(self.es, actions)
                        indexed += response[0]
                        if not indexed % 200:
                            print('\tindexed %s/%s\t%s%%' % (indexed, self.limit,
                                int(indexed / self.limit * 100)))
                        actions = []
                    except ConnectionTimeout:
                        print('error indexing: connection timeout')

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            response = bulk(self.es, actions)
            indexed += response[0]
            print('indexed %s' % (indexed))
        ids = list(ids)
        # print('%s\n%s' % (len(ids), ' '.join(ids)))
Project: defplorex    Author: trendmicro
def bulk_index_from_it(
            self, index, it, transform=lambda x: x, last_updated=True):

        gc.collect()
        err_ids = []

        def _it():
            for doc_body in it:
                try:
                    log.debug('Working on record: %s', doc_body)
                    _id = doc_body.get(self.id_field)

                    try:
                        doc_body = transform(doc_body)
                    except Exception as e:
                        log.warn(
                                'Error while transforming doc ID = %s: %s',
                                _id, e)
                        raise e

                    if doc_body:
                        if last_updated:
                            doc_body['last_updated'] = datetime.now()

                        op = self.partial_index_op(
                                doc_id=_id,
                                index=index,
                                doc_body=doc_body,
                                doc_type=self.doc_type)
                        yield op
                except Exception as e:
                    log.warn('Cannot process doc ID = %s: %s', _id, e)
                    err_ids.append(_id)

        try:
            self.bulk(_it())
            log.info('Invoked self.bulk(_it())')
        except Exception as e:
            log.warn('Error in bulk index because: %s', e)

        return err_ids
Project: defplorex    Author: trendmicro
def bulk(self, it):
        try:
            log.info('Sending bulk request on iterable/generator')
            args = dict(client=self.client,
                        actions=it,
                        chunk_size=self.bulk_size,
                        raise_on_exception=False,
                        raise_on_error=False,
                        stats_only=False,
                        request_timeout=self.timeout)

            res_succ, res_err = helpers.bulk(**args)

            log.info(
                    'Sent bulk request on queue iterator: '
                    'successful ops = %d, failed ops = %d',
                    res_succ, len(res_err))

            for res in res_err:
                log.warn('Error response: %s', res)
        except Exception as e:
            log.error('Error in storing: %s', e, exc_info=True)
Project: log-ioc    Author: willylong275
def bulk_index(self, index_name, dict_list):
        res = helpers.bulk(self.es, dict_list)
        print(" response: '%s'" % (str(res)))
        print(res)
        return
Project: samnorsk    Author: gisleyt
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dump-file')
    parser.add_argument('-e', '--elasticsearch-host', default='localhost:9200')
    parser.add_argument('-i', '--index', default='wikipedia')
    parser.add_argument('-l', '--limit', default=0, type=int)
    parser.add_argument('-p', '--id-prefix')
    opts = parser.parse_args()

    dump_fn = opts.dump_file
    es_host = opts.elasticsearch_host
    es_index = opts.index
    limit = opts.limit if opts.limit > 0 else None
    prefix = opts.id_prefix

    if not dump_fn:
        logging.error('missing filenames ...')
        sys.exit(1)

    gen = articles(dump_fn, limit=limit)

    es = Elasticsearch(hosts=[es_host])
    ic = IndicesClient(es)

    if not ic.exists(es_index):
        ic.create(es_index)

    while True:
        chunk = islice(gen, 0, 1000)

        actions = [{'_index': es_index,
                    '_type': 'article',
                    '_id': article['id'] if not prefix else '%s-%s' % (prefix, article['id']),
                    '_source': article}
                   for article in chunk]

        if not actions:
            break

        helpers.bulk(es, actions)
Project: mygene.info    Author: biothings
def update(self, id, extra_doc, index_type=None, bulk=False):
        '''update an existing doc with extra_doc.'''
        conn = self.conn
        index_name = self.ES_INDEX_NAME
        index_type = index_type or self.ES_INDEX_TYPE
        # old way, update locally and then push it back.
        # return self.conn.update(extra_doc, self.ES_INDEX_NAME,
        #                         index_type, id)

        if not bulk:
            body = {'doc': extra_doc}
            return conn.update(index_name, index_type, id, body)
        else:
            raise NotImplementedError
            '''
            # ES supports bulk update since v0.90.1.
            op_type = 'update'
            cmd = {op_type: {"_index": index_name,
                             "_type": index_type,
                             "_id": id}
                   }

            doc = json.dumps({"doc": extra_doc}, cls=conn.encoder)
            command = "%s\n%s" % (json.dumps(cmd, cls=conn.encoder), doc)
            conn.bulker.add(command)
            return conn.flush_bulk()
            '''
Project: mygene.info    Author: biothings
def update_docs(self, partial_docs, **kwargs):
        index_name = self.ES_INDEX_NAME
        doc_type = self.ES_INDEX_TYPE

        def _get_bulk(doc):
            doc = {
                '_op_type': 'update',
                "_index": index_name,
                "_type": doc_type,
                "_id": doc['_id'],
                "doc": doc
            }
            return doc
        actions = (_get_bulk(doc) for doc in partial_docs)
        return helpers.bulk(self.conn, actions, chunk_size=self.step, **kwargs)
Project: chatbot_ner    Author: hellohaptik
def create_all_dictionary_data(connection, index_name, doc_type, logger, entity_data_directory_path=None,
                               csv_file_paths=None, **kwargs):
    """
    Indexes all entity data from csv files stored at entity_data_directory_path, one file at a time
    Args:
        connection: Elasticsearch client object
        index_name: The name of the index
        doc_type: The type of the documents being indexed
        logger: logging object to log at debug and exception level
        entity_data_directory_path: Optional, Path of the directory containing the entity data csv files.
                                    Default is None
        csv_file_paths: Optional, list of file paths to csv files. Default is None
        kwargs:
            Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk

    """
    logger.debug('%s: +++ Started: create_all_dictionary_data() +++' % log_prefix)
    if entity_data_directory_path:
        logger.debug('%s: \t== Fetching from variants/ ==' % log_prefix)
        csv_files = get_files_from_directory(entity_data_directory_path)
        for csv_file in csv_files:
            csv_file_path = os.path.join(entity_data_directory_path, csv_file)
            create_dictionary_data_from_file(connection=connection, index_name=index_name, doc_type=doc_type,
                                             csv_file_path=csv_file_path, update=False, logger=logger, **kwargs)
    if csv_file_paths:
        for csv_file_path in csv_file_paths:
            if csv_file_path and csv_file_path.endswith('.csv'):
                create_dictionary_data_from_file(connection=connection, index_name=index_name, doc_type=doc_type,
                                                 csv_file_path=csv_file_path, update=False, logger=logger, **kwargs)
    logger.debug('%s: +++ Finished: create_all_dictionary_data() +++' % log_prefix)
Project: chatbot_ner    Author: hellohaptik
def recreate_all_dictionary_data(connection, index_name, doc_type, logger, entity_data_directory_path=None,
                                 csv_file_paths=None, **kwargs):
    """
    Re-indexes all entity data from csv files stored at entity_data_directory_path, one file at a time
    Args:
        connection: Elasticsearch client object
        index_name: The name of the index
        doc_type: The type of the documents being indexed
        logger: logging object to log at debug and exception level
        entity_data_directory_path: Optional, Path of the directory containing the entity data csv files.
                                    Default is None
        csv_file_paths: Optional, list of file paths to csv files. Default is None
        kwargs:
            Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk

    """
    logger.debug('%s: +++ Started: recreate_all_dictionary_data() +++' % log_prefix)
    if entity_data_directory_path:
        logger.debug('%s: \t== Fetching from variants/ ==' % log_prefix)
        csv_files = get_files_from_directory(entity_data_directory_path)
        for csv_file in csv_files:
            csv_file_path = os.path.join(entity_data_directory_path, csv_file)
            create_dictionary_data_from_file(connection=connection, index_name=index_name, doc_type=doc_type,
                                             csv_file_path=csv_file_path, update=True, logger=logger, **kwargs)
    if csv_file_paths:
        for csv_file_path in csv_file_paths:
            if csv_file_path and csv_file_path.endswith('.csv'):
                create_dictionary_data_from_file(connection=connection, index_name=index_name, doc_type=doc_type,
                                                 csv_file_path=csv_file_path, update=True, logger=logger, **kwargs)
    logger.debug('%s: +++ Finished: recreate_all_dictionary_data() +++' % log_prefix)
Project: chatbot_ner    Author: hellohaptik
def get_variants_dictionary_value_from_key(csv_file_path, dictionary_key, logger, **kwargs):
    """
    Reads the csv file at csv_file_path and creates a dictionary mapping each entity value to a list of its variants.
    The entity values are in the first column of the csv file and their corresponding variants are stored in the second
    column, delimited by '|'

    Args:
        csv_file_path: absolute file path of the csv file populate entity data from
        dictionary_key: name of the entity to be put the values under
        logger: logging object to log at debug and exception level
        kwargs:
            Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk

    Returns:
        Dictionary mapping entity value to a list of their variants.
    """
    dictionary_value = defaultdict(list)
    try:
        csv_reader = read_csv(csv_file_path)
        next(csv_reader)
        for data_row in csv_reader:
            try:
                data = map(str.strip, data_row[1].split('|'))
                # remove empty strings
                data = [variant for variant in data if variant]
                dictionary_value[data_row[0].strip().replace('.', ' ')].extend(data)

            except Exception as e:
                logger.exception('%s: \t\t== Exception in dict creation for keyword: %s -- %s -- %s =='
                                 % (log_prefix, dictionary_key, data_row, e))

    except Exception as e:
        logger.exception(
            '%s: \t\t\t=== Exception in __get_variants_dictionary_value_from_key() Dictionary Key: %s \n %s  ===' % (
                log_prefix,
                dictionary_key, e))

    return dictionary_value
Project: chatbot_ner    Author: hellohaptik
def create_dictionary_data_from_file(connection, index_name, doc_type, csv_file_path, update, logger, **kwargs):
    """
    Indexes all entity data from the csv file at path csv_file_path
    Args:
        connection: Elasticsearch client object
        index_name: The name of the index
        doc_type:  The type of the documents being indexed
        csv_file_path: absolute file path of the csv file to populate entity data from
        update: boolean, True if this is a update type operation, False if create/index type operation
        logger: logging object to log at debug and exception level
        kwargs:
            Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
    """

    base_file_name = os.path.basename(csv_file_path)
    dictionary_key = os.path.splitext(base_file_name)[0]

    if update:
        delete_entity_by_name(connection=connection, index_name=index_name, doc_type=doc_type,
                              entity_name=dictionary_key, logger=logger, **kwargs)
    dictionary_value = get_variants_dictionary_value_from_key(csv_file_path=csv_file_path,
                                                              dictionary_key=dictionary_key, logger=logger,
                                                              **kwargs)
    if dictionary_value:
        add_data_elastic_search(connection=connection, index_name=index_name, doc_type=doc_type,
                                dictionary_key=dictionary_key,
                                dictionary_value=remove_duplicate_data(dictionary_value), logger=logger, **kwargs)
    if os.path.exists(csv_file_path) and os.path.splitext(csv_file_path)[1] == '.csv':
        os.path.basename(csv_file_path)
Project: micromasters    Author: mitodl
def _index_chunk(chunk, doc_type, index):
    """
    Add/update a list of records in Elasticsearch

    Args:
        chunk (list):
            List of serialized items to index
        doc_type (str):
            The doc type for each item
        index (str): An Elasticsearch index

    Returns:
        int: Number of items inserted into Elasticsearch
    """

    conn = get_conn(verify_index=index)
    insert_count, errors = bulk(
        conn,
        chunk,
        index=index,
        doc_type=doc_type,
    )
    if len(errors) > 0:
        raise ReindexException("Error during bulk insert: {errors}".format(
            errors=errors
        ))

    refresh_index(index)
    return insert_count
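The call above passes index= and doc_type= through helpers.bulk(), which forwards keyword arguments it does not consume to the underlying bulk API request, so they act as defaults for actions that omit _index and _type. A short sketch of that pattern, assuming invented names:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
# Actions without _index/_type; the defaults come from the keyword arguments below.
chunk = [{"_id": i, "_source": {"value": i}} for i in range(10)]
inserted, errors = helpers.bulk(es, chunk, index="example-index", doc_type="example-type")
print("inserted %d docs, %d errors" % (inserted, len(errors)))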
Project: elasticsearch-copy    Author: xcxsxvx
def output_es(es, records, start_record_num, end_record_num, total_records, per_batch):
  print("Inserting records %d through %d of %s" % (start_record_num, end_record_num,
    (str(total_records) if total_records > 0 else '???')))
  num_success, error_list = helpers.bulk(es, records, chunk_size=1000)
  if num_success != per_batch:
    print("[ERROR] %d of %d inserts succeeded!" % (num_success,per_batch))
    print("[ERROR] Errors:")
    print(error_list)
Project: open-ledger    Author: creativecommons
def insert_image(chunk_size, max_results=5000, from_file=None):
    count = 0
    success_count = 0
    es = search.init()
    search.Image.init()
    mapping = search.Image._doc_type.mapping
    mapping.save(settings.ELASTICSEARCH_INDEX)

    for chunk in grouper_it(chunk_size, import_from_file(from_file)):
        if not from_file and count >= max_results:  # Load everything if loading from file
            break
        else:
            images = []
            for result in chunk:
                images.append(result)
            if len(images) > 0:
                try:
                    # Bulk update the search engine too
                    search_objs = [search.db_image_to_index(img).to_dict(include_meta=True) for img in images]
                    models.Image.objects.bulk_create(images)
                    helpers.bulk(es, search_objs)
                    log.debug("*** Committed set of %d images", len(images))
                    success_count += len(images)
                except IntegrityError as e:
                    log.warn("Got one or more integrity errors on batch: %s", e)
                finally:
                    count += len(images)
    return success_count
Project: knowledge-graph    Author: MixedEmotions
def write2ES(AllData, indexName, typeName , elasticPort=9220, elasticHost="localhost"):
        es = Elasticsearch([{'host': elasticHost, 'port': elasticPort}], http_auth=(elasticUsername, elasticPassword))
        messages = []
        logging.debug('Running preset')
        for record in AllData:
            #   print(record)
                tmpMap = {"_op_type": "index", "_index": indexName, "_type": typeName }
                if len(record)>2:
                    tmpMap.update({"label":record[0].replace("http://dbpedia.org/resource/",""), "entity_linking":{"URI":record[0].replace("/resource/","/page/"),"connection":record[1] ,"target":record[2].replace("/resource/","/page/")}})
                else:
                    tmpMap.update({"label":record[0].replace("http://dbpedia.org/resource/",""), "entity_linking":{"URI":record[0].replace("/resource/","/page/"),"target":record[1]}})
                messages.append(tmpMap)
        #print messages
        result = bulk(es, messages)
        return result
Project: knowledge-graph    Author: MixedEmotions
def writeDashboard2ES(name, indexName=".kibi", typeName="dashboard" , elasticPort=9220, elasticHost="localhost"):
        es = Elasticsearch([{'host': elasticHost, 'port': elasticPort}], http_auth=(elasticUsername, elasticPassword))
        messages = []
        PercentageEmotions = "PercentageEmotions"
        logging.debug('Writing dashboard')
        print('Writing dashboard')
        PercentageEmotions = "PercentageEmotions"
        EmotionDistribution = "EmotionDistribution"
        cloudVisualization= name[:name.find("_")]+"Cloud"
        nameFull= name[:name.find("_")]

        tmpMap = {"_op_type": "index", "_index": indexName, "_type": typeName, "_id":name }
        tmpMap.update({"defaultIndex": "reviews", "kibi:relationalPanel": "true"})
        tmpMap.update({"savedSearchId": name})
        tmpMap.update({"sort": ["_score", "desc" ], "version": 1,"description": "", "hits": 0, "optionsJSON": "{\"darkTheme\":false}", "uiStateJSON": "{}", "timeRestore": "false"})
        tmpMap.update({"kibanaSavedObjectMeta": { "searchSourceJSON": "{\"filter\":[{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}}}]}"}})
        if name.find("tweets")>= 0:
            tmpMap.update({ "title":name, "panelsJSON": "[{\"id\":\""+name+"\",\"type\":\"search\",\"panelIndex\":3,\"size_x\":7,\"size_y\":6,\"col\":1,\"row\":1,\"columns\":[\"emotions.emotion\",\"detected_entities\",\"text\"],\"sort\":[\"entity_linking.URI\",\"desc\"]},{\"id\":\""+EmotionDistribution+"\",\"type\":\"visualization\",\"panelIndex\":4,\"size_x\":5,\"size_y\":4,\"col\":8,\"row\":3},{\"id\":\""+PercentageEmotions+"\",\"type\":\"visualization\",\"panelIndex\":5,\"size_x\":5,\"size_y\":2,\"col\":8,\"row\":1}]"})
        else:
        #   tmpMap.update({ "title":name,"panelsJSON": "[{\"id\":\""+name+"\",\"type\":\"search\",\"panelIndex\":1,\"size_x\":6,\"size_y\":6,\"col\":1,\"row\":1,\"columns\":[\"label\",\"entity_linking.target\"],\"sort\":[\"_score\",\"desc\"]},{\"id\":\"entityEmotion\",\"type\":\"visualization\",\"panelIndex\":3,\"size_x\":6,\"size_y\":4,\"col\":7,\"row\":6},{\"id\":\"Location\",\"type\":\"search\",\"panelIndex\":2,\"size_x\":6,\"size_y\":2,\"col\":1,\"row\":7,\"columns\":[\"label\",\"entity_linking.connection\",\"entity_linking.target\"],\"sort\":[\"connection\",\"asc\"]},{\"id\":\"PercentageEmotions\",\"type\":\"visualization\",\"panelIndex\":4,\"size_x\":6,\"size_y\":5,\"col\":7,\"row\":1}]"})
            tmpMap.update({ "title":name,"panelsJSON": "[{\"col\":1,\"columns\":[\"label\",\"entity_linking.target\"],\"id\":\""+name+"\",\"panelIndex\":1,\"row\":1,\"size_x\":6,\"size_y\":2,\"sort\":[\"_score\",\"desc\"],\"type\":\"search\"},{\"col\":7,\"id\":\""+PercentageEmotions+"\",\"panelIndex\":4,\"row\":1,\"size_x\":6,\"size_y\":3,\"type\":\"visualization\"},{\"col\":1,\"columns\":[\"label\",\"entity_linking.connection\",\"entity_linking.target\"],\"id\":\""+nameFull+"\",\"panelIndex\":2,\"row\":3,\"size_x\":6,\"size_y\":2,\"sort\":[\"connection\",\"asc\"],\"type\":\"search\"},{\"id\":\""+cloudVisualization+"\",\"type\":\"visualization\",\"panelIndex\":5,\"size_x\":3,\"size_y\":2,\"col\":1,\"row\":5},{\"id\":\""+EmotionDistribution+"\",\"type\":\"visualization\",\"panelIndex\":6,\"size_x\":6,\"size_y\":3,\"col\":7,\"row\":4}]"})
             #"[{\"id\":\""+name+"\",\"type\":\"search\",\"panelIndex\":1,\"size_x\":12,\"size_y\":6,\"col\":1,\"row\":1,\"columns\":[\"label\", \"entity_linking.connection\", \"entity_linking.target\"],\"sort\":[\"_score\",\"desc\"]}]"})
        messages.append(tmpMap)
        print (messages)
        result = bulk(es, messages)
        return result
Project: repoxplorer    Author: morucci
def add_commits(self, source_it):
        def gen(it):
            for source in it:
                d = {}
                d['_index'] = self.index
                d['_type'] = self.dbname
                d['_op_type'] = 'create'
                d['_id'] = source['sha']
                d['_source'] = source
                yield d
        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)
Project: repoxplorer    Author: morucci
def del_commits(self, sha_list):
        def gen(it):
            for sha in it:
                d = {}
                d['_index'] = self.index
                d['_type'] = self.dbname
                d['_op_type'] = 'delete'
                d['_id'] = sha
                yield d
        bulk(self.es, gen(sha_list))
        self.es.indices.refresh(index=self.index)
Project: handelsregister    Author: Amsterdam
def execute(self):
        """
        Index data of specified queryset
        """
        client = elasticsearch.Elasticsearch(
            hosts=settings.ELASTIC_SEARCH_HOSTS,
            # sniff_on_start=True,
            retry_on_timeout=True,
            refresh=True
        )

        start_time = time.time()
        duration = time.time()
        loop_time = elapsed = duration - start_time

        for batch_i, total_batches, start, end, total, qs in self.batch_qs():

            loop_start = time.time()
            total_left = ((total_batches - batch_i) * loop_time)

            progres_msg = \
                '%s of %s : %8s %8s %8s duration: %.2f left: %.2f' % (
                    batch_i, total_batches, start, end, total, elapsed,
                    total_left
                )

            log.debug(progres_msg)

            helpers.bulk(
                client, (self.convert(obj).to_dict(include_meta=True)
                         for obj in qs),
                raise_on_error=True,
                refresh=True
            )

            now = time.time()
            elapsed = now - start_time
            loop_time = now - loop_start
Project: find-that-charity    Author: TechforgoodCAST
def save_to_elasticsearch(chars, es, es_index):

    print('\r', "[elasticsearch] %s charities to save" % len(chars))
    print('\r', "[elasticsearch] saving %s charities to %s index" % (len(chars), es_index))
    results = bulk(es, list(chars.values()))
    print('\r', "[elasticsearch] saved %s charities to %s index" % (results[0], es_index))
    print('\r', "[elasticsearch] %s errors reported" % len(results[1]))
Project: elasticsearch_loader    Author: moshe
def load(lines, config):
    bulks = grouper(lines, config['bulk_size'] * 3)
    if config['progress']:
        bulks = [x for x in bulks]
    with click.progressbar(bulks) as pbar:
        for i, bulk in enumerate(pbar):
            try:
                single_bulk_to_es(bulk, config, config['with_retry'])
            except Exception as e:
                log('warn', 'Chunk {i} got exception ({e}) while processing'.format(e=e, i=i))
Project: cfme-performance    Author: redhat-performance
def gen_action(self, **kwargs):
        """
        single ES doc that gets consumed inside bulk uploads
        """
        action = {
            "_op_type": _op_type,
            "_index": kwargs['index_name'],
            "_type": kwargs['doc_type'],
            "_id": kwargs['uid'],
            # "@timestamp": kwargs['timestamp'],
            "_source": kwargs['data']
        }
        return action
Project: ml-anomaly-injector    Author: plinde
def buildEvent(timestamp = None):

    event = {}

    #if we don't have a desired timestamp passed in for the event, use the current time in UTC
    if timestamp == None:
        event['timestamp'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
    else:
        event['timestamp'] = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

    #TODO make some of these random inputs from seed lists

    #add these 2 for bulk API goodness
    event['_index'] = 'smoke_event'
    event['_type'] = 'smoke_event'

    event['request'] = '/index.html'
    event['response'] = '200'
    event['agent'] = 'Firefox'
    event['remote_ip'] = '1.1.1.1'
    event['remote_user'] = ''
    event['bytes'] = '1234'
    event['referrer'] = 'http://example.com'

    json_event = json.dumps(event)

    return json_event
Project: ml-anomaly-injector    Author: plinde
def main():

    bulkSize = 10000 # elasticsearch bulk size
    daysBack = 7

    anomalyPeriod = 30 # period for anomaly to last, in minutes
    anomalyMagnification = 10 # e.g. 10x more than the normal

    buildEventSeries(daysBack, bulkSize)
    buildAnomalyEventSeries(daysBack, anomalyPeriod, anomalyMagnification, bulkSize)