The following 45 code examples, extracted from open source Python projects, illustrate how to use elasticsearch_dsl.Q.
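Before the examples, here is a minimal, self-contained sketch of the pattern they all build on (the index name "my-index" and the field names below are illustrative placeholders, not taken from any of the projects): Q objects describe individual query clauses and can be combined with &, |, and ~ before being attached to a Search.

from elasticsearch_dsl import Q, Search

# Leaf queries; "title", "status" and "is_deleted" are placeholder field names.
title_q = Q("match", title="python")
published_q = Q("term", status="published")

# Q objects combine with boolean operators into a single bool query.
combined = title_q & published_q & ~Q("term", is_deleted=True)

# Attach the query to a Search; to_dict() shows the request body that would be sent.
s = Search(index="my-index").query(combined)
print(s.to_dict())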
def get_summary_statistics():
    """
    Obtains statistics about current sum of flows, packets, bytes.

    :return: JSON with status "ok" or "error" and requested data.
    """
    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': "now-5m", 'lte': "now"}}})
        elastic_bool.append({'term': {'@type': 'protocols_statistics'}})

        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('sum_of_flows', 'sum', field='flows')
        s.aggs.bucket('sum_of_packets', 'sum', field='packets')
        s.aggs.bucket('sum_of_bytes', 'sum', field='bytes')
        s.sort('@timestamp')
        result = s.execute()

        # Result parsing into CSV in format: timestamp, flows, packets, bytes
        data = "Timestamp, Flows, Packets, Bytes;"
        timestamp = "Last 5 Minutes"
        data += timestamp + ', ' +\
                str(int(result.aggregations.sum_of_flows['value'])) + ', ' +\
                str(int(result.aggregations.sum_of_packets['value'])) + ', ' +\
                str(int(result.aggregations.sum_of_bytes['value']))

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
        return json_response
def query_geonames_country(self, placename, country):
    """
    Like query_geonames, but this time limited to a specified country.
    """
    # first, try for an exact phrase match
    q = {"multi_match": {"query": placename,
                         "fields": ['name^5', 'asciiname^5', 'alternativenames'],
                         "type": "phrase"}}
    r = Q("match", country_code3=country)
    res = self.conn.query(q).query(r)[0:50].execute()

    # if no results, use some fuzziness, but still require all terms to be present.
    # Fuzzy is not allowed in "phrase" searches.
    if res.hits.total == 0:
        # tried wrapping this in a {"constant_score" : {"query": ... but made it worse
        q = {"multi_match": {"query": placename,
                             "fields": ['name', 'asciiname', 'alternativenames'],
                             "fuzziness": 1,
                             "operator": "and"}}
        r = Q("match", country_code3=country)
        res = self.conn.query(q).query(r)[0:50].execute()

    out = utilities.structure_results(res)
    return out
def query_missing(s, field, name, methods=None, responsecodes=None, invert=False):
    # main query
    q = Q("match", **{field: name})
    if not invert:
        q = ~q
    s.query = q

    # add filters
    ## method
    if methods:
        s = s.filter("terms", **{'request.method': methods})

    ## response codes
    if responsecodes:
        for rc in responsecodes:
            rcrange = rc.split("-")
            if len(rcrange) == 2:
                s = s.filter("range", **{'response.status': {"gte": int(rcrange[0]), "lte": int(rcrange[1])}})
            else:
                s = s.filter("term", **{'response.status': rc})

    print_debug(s.to_dict())
    return s
def query_vals(s, field, name, values, invert):
    # match documents where given field value name is present, if required
    if values:
        q = Q("nested", path=field, query=Q("wildcard", **{field + ".value.keyword": values}))
        if invert:
            s.query = ~q
        else:
            s.query = q
    else:
        s.query = Q()

    # 1. descend into response.headers/request.parameters
    # 2. filter given header
    # 3. aggregate values
    # 4. jump back into main document
    # 5. aggregate URLs
    s.aggs.bucket("field", "nested", path=field)\
        .bucket("valuefilter", "filter", Q("match", **{field + ".name": name}))\
        .bucket("values", "terms", field=field + ".value.keyword", size=args.size)\
        .bucket("main", "reverse_nested")\
        .bucket("urls", "terms", field="request.url.keyword", size=args.size)

    return s
def about(request):
    """Information about the current site, its goals, and what content is loaded"""
    # Provider counts
    providers = cache.get_or_set(CACHE_STATS_NAME, [], CACHE_STATS_DURATION)
    if not providers:
        for provider in sorted(settings.PROVIDERS.keys()):
            s = Search()
            q = Q('term', provider=provider)
            s = s.query(q)
            response = s.execute()
            if response.hits.total > 0:
                data = settings.PROVIDERS[provider]
                total = intcomma(response.hits.total)
                data.update({'hits': total})
                providers.append(data)
        # All results
        s = Search()
        response = s.execute()
        total = intcomma(response.hits.total)
        providers.append({'display_name': 'Total', 'hits': total})
        cache.set(CACHE_STATS_NAME, providers)
    return render(request, "about.html", {'providers': providers})
def correct_orphan_records(self, provider='europeana', end=None):
    """[#185] Delete records from the search engine which aren't found in the database"""
    s = Search()
    q = Q('term', provider=provider)
    s = s.query(q)
    response = s.execute()
    total = response.hits.total

    # A file extracted from the production database listing all of the europeana identifiers
    identifier_file = '/tmp/europeana-identifiers.json'
    db_identifiers = set(json.load(open(identifier_file)))
    total_in_db = len(db_identifiers)
    log.info("Using search engine instance %s", settings.ELASTICSEARCH_URL)
    log.info("Total records: %d (search engine), %d (database) [diff=%d]", total, total_in_db, total - total_in_db)

    deleted_count = 0
    for r in s.scan():
        if r.identifier not in db_identifiers:
            img = search.Image.get(id=r.identifier)
            log.debug("Going to delete image %s", img)
            deleted_count += 1
    log.info("Deleted %d from search engine", deleted_count)
def aggregate(self, search):
    """
    Add aggregations representing the facets selected, including potential filters.
    """
    for f, facet in iteritems(self.facets):
        agg = facet.get_aggregation()
        agg_filter = esd.Q('match_all')
        for field, filter in iteritems(self._filters):
            if f == field or (f.startswith("date") and field.startswith("date")):
                continue
            agg_filter &= filter
        search.aggs.bucket(
            '_filter_' + f,
            'filter',
            filter=agg_filter
        ).bucket(f, agg)
def build(self):
    fs = self._clone()

    for facet in self.facets:
        if "include_%s" % facet.name not in self.args:
            continue
        agg_filter = esd.Q("match_all")
        for inner in self.facets:
            if inner.name != facet.name:
                if inner.is_filtered(self.args):
                    agg_filter &= inner.filters(self.args)
        for agg_name, agg in facet.aggregates():
            fs.aggs.bucket("_filter_" + agg_name, "filter", filter=agg_filter).bucket(agg_name, agg)

    post_filter = esd.Q('match_all')
    for facet in self.facets:
        if facet.is_filtered(self.args):
            post_filter &= facet.filters(self.args)
    fs.post_filter._proxied &= post_filter

    return fs
def tag_by_email(self, emails, breached):
    docs = []
    s = Search(using=self.es).\
        filter(Q({'terms': {'contact_email.keyword': emails}})).\
        source(['id_submission'])
    print('%s emails breached=%s' % (len(emails), breached))
    for hit in s.scan():
        docs.append(lib.bulk_update_doc(hit['id_submission'], {'breached': breached}))
        if not len(docs) % 500:
            print('\tfetched %s' % len(docs))
    print('\t%s matches' % len(docs))
    return docs
def run(self):
    emails = {
        'breached': set(),
        'unbreached': set(),
    }

    # contact_email exists
    must = [Q('exists', field='contact_email')]
    # matches source if specified
    if self.source:
        must.append(Q({'term': {'analysis.source': self.source}}))

    # not already tagged with breached
    s = Search(using=self.es).\
        query(FunctionScore(
            query=Q('bool', must=must, must_not=[Q('exists', field='analysis.breached')]),
            functions=[SF('random_score', seed=int(time.time()))]
        )).\
        source(['contact_email'])
    print('%s breached: source=%s limit=%s' % (datetime.now().isoformat(), self.source, self.limit))
    print('query=\n%s' % json.dumps(s.to_dict()))

    for filing in s[:self.limit]:
        email = filing['contact_email']
        if not email or email in emails['breached'] or email in emails['unbreached']:
            continue
        breached = self.is_breached(email)
        emails['breached' if breached else 'unbreached'].add(email)

    docs = []
    print('done source=%s' % self.source)
    if emails['breached']:
        docs += self.tag_by_email(list(emails['breached']), True)
    if emails['unbreached']:
        docs += self.tag_by_email(list(emails['unbreached']), False)

    try:
        lib.bulk_update(self.es, docs)
    except Exception as e:
        print('error indexing: %s' % e)
def monitor(index, delta, query_string):
    click.clear()

    def cnt():
        q = Q('query_string', query=query_string)
        s = Search(using=es.client, index=index).query(q)
        return s.count()

    N = cnt()
    tot = Search(using=es.client, index=index).count()
    if not delta:
        N = tot
    log.info('Processing %d records (total: %d)', N, tot)
    click.echo('You can exit by CTRL-C: results will still process')

    bar = SlowOverallFancyBar('', max=N, grand_total=tot)
    while True:
        time.sleep(5.0)
        try:
            n = cnt()
            if isinstance(n, int):
                if delta:
                    done = N - n
                else:
                    done = n
                bar.goto(done)
        except Exception as e:
            log.warn('Cannot count: %s', e)
    bar.finish()
def search(self, **kwargs):
    q = kwargs.get('q', '*')
    sort = kwargs.get('sort', 'timestamp')
    search_after = kwargs.get('search_after')
    size = kwargs.get('size', 50)
    source = kwargs.get('source')

    extra = dict(size=size)
    if search_after:
        extra.update(dict(search_after=search_after))

    s = Search(using=self.client, index=self.index_name)
    if source:
        s = s.source(source)
    s = s.sort(sort)
    s = s.query(Q('query_string', query=q))
    s = s.extra(**extra)
    log.info('Query: %s', s.to_dict())

    r = s.execute()
    count = r.hits.total
    took = r.took
    result = r, count, took
    return result
def search_reports(state, must_terms, should_terms):
    s = Report.search()
    q = Q('bool',
          must=[Q('match', body=term) for term in must_terms],
          should=[Q('match', body=term) for term in should_terms],
          minimum_should_match=1)
    s = s.filter('terms', state=[state]).query(q)
    response = s.execute()
    return response.to_dict()['hits']['hits']
def test_create_search_obj_filter(self, is_advance_search_capable):
    """
    Test that Search objects are created with program-limiting and filled_out=True query parameters
    """
    user = self.user if is_advance_search_capable else self.learner
    search_obj = create_search_obj(user)
    search_query_dict = search_obj.to_dict()
    expected_program_query = Q(
        'bool',
        should=[
            Q('term', **{'program.id': self.program.id})
        ],
        minimum_should_match=1,
        must=[
            Q('term', **{'program.is_learner': True})
        ]
    )
    expected_filled_out_query = Q('term', **{'profile.filled_out': True})
    expected_privacy_query = ~Q('term', **{'profile.account_privacy': 'private'})
    assert 'query' in search_query_dict
    assert 'bool' in search_query_dict['query']
    assert 'filter' in search_query_dict['query']['bool']
    assert len(search_query_dict['query']['bool']['filter']) == (2 if is_advance_search_capable else 3)
    expected_filters = [
        expected_program_query.to_dict(),
        expected_filled_out_query.to_dict(),
    ]
    if not is_advance_search_capable:
        expected_filters.insert(0, expected_privacy_query.to_dict())
    assert search_query_dict['query']['bool']['filter'] == expected_filters
def create_program_limit_query(user, staff_program_ids, filter_on_email_optin=False):
    """
    Constructs and returns a query that limits a user to data for their allowed programs

    Args:
        user (django.contrib.auth.models.User): A user
        staff_program_ids (list of int): the list of program ids the user is staff for, if any
        filter_on_email_optin (bool): If true, filter out profiles where email_optin != true

    Returns:
        elasticsearch_dsl.query.Q: An elasticsearch query
    """
    users_allowed_programs = get_searchable_programs(user, staff_program_ids)
    # if the user cannot search any program, raise an exception.
    # in theory this should never happen because `UserCanAdvanceSearchPermission`
    # takes care of doing the same check, but better to keep it to avoid
    # that a theoretical bug exposes all the data in the index
    if not users_allowed_programs:
        raise NoProgramAccessException()

    must = [
        Q('term', **{'program.is_learner': True})
    ]

    if filter_on_email_optin:
        must.append(Q('term', **{'profile.email_optin': True}))

    # no matter what the query is, limit the programs to the allowed ones
    # if this is a superset of what searchkit sends, this will not impact the result
    return Q(
        'bool',
        should=[
            Q('term', **{'program.id': program.id}) for program in users_allowed_programs
        ],
        # require that at least one program id matches the user's allowed programs
        minimum_should_match=1,
        must=must,
    )
def query(s, q):
    s.query = Q("query_string", query=q)
    return s


### Main ###
def get_es_query(self, s_query, s_fields, **kwargs):
    """
    Create and return elasticsearch Query.
    You could overload this method for creating your custom search Query object.

    Arguments:
        s_query: request search param
        s_fields: search fields

    Keyword arguments:
        request: request object
        view: view object
    """
    return Q("multi_match", query=s_query, fields=s_fields)
def elasticsearch_retrieve_page_by_id(page_id):
    query = Search().filter(Q("term", nid=int(page_id)))[:1]
    result = query.execute()
    if result.hits.total == 0:
        return None
    return result.hits[0]
def elasticsearch_delete_old():
    _from = NEVER
    _to = datetime.now() - timedelta(days=30)
    query = Search().filter(Q("range", visited_at={'from': _from, 'to': _to}))
    result = query.delete()
def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit

    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)

    limit = max_result_limit if context["more"] else result_limit

    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    if context['phrase']:
        query = Search().filter(has_parent_query).query(Q("match_phrase", body_stripped=context['search']))
    else:
        query = Search().filter(has_parent_query).query(Q("match", body_stripped=context['search']))

    query = query.highlight_options(order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at', 'visited_at']).params(request_cache=True)

    if context["sort"] == "onion":
        query = query.sort("_parent")
    elif context["sort"] == "visited_at":
        query = query.sort("-visited_at")
    elif context["sort"] == "created_at":
        query = query.sort("-created_at")
    elif context["sort"] == "last_seen":
        query = query.sort("-visited_at")

    return query.execute()
def test_query(self):
    """It should be possible to query the search engine for results"""
    q = Q("match", title="greyhounds")
    s = self.s.query(q)
    r = s.execute()
    self.assertEqual(0, r.hits.total)  # We haven't indexed anything, so no results are expected
def test_search(self):
    """It should be possible to find an item by query"""
    self._index_img(self.img1)
    s = self.s.query(Q("match", title="greyhounds"))
    r = s.execute()
    self.assertEquals(1, r.hits.total)
def test_remove_from_search_after_sync(self):
    """When an image is removed from the source, it should be removed from the search engine"""
    self._index_img(self.removed)
    s = self.s.query(Q("match", title="removed"))
    r = s.execute()
    self.assertEquals(1, r.hits.total)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.HEAD, FOREIGN_URL + TEST_IMAGE_REMOVED, status=404)
        self.removed.sync()
    signals._update_search_index(self.removed)
    self.es.indices.refresh()
    s = self.s.query(Q("match", title="removed"))
    r = s.execute()
    self.assertEquals(0, r.hits.total)
def get(self, handler):
    # Rip the search object out of the elasticsearch backend
    sort = handler.sort
    search = self.collection._state._backend.raw_backend().search

    if handler.request.query_arguments.get('q'):
        search = search.query(elasticsearch_dsl.Q('query_string', query=handler.request.query_arguments['q'][-1].decode('utf-8')))
    else:
        # This should technically be elsewhere but the search object
        # does not provide a nice way to figure out if there is a query or not.
        search = search.sort({'ref': {'order': 'asc', 'unmapped_type': 'string'}})

    if handler.request.query_arguments.get('sort'):
        search = search.sort({sort.key: {'order': 'asc' if sort.order == 1 else 'desc', 'unmapped_type': 'string'}})

    # Hacking into the serializer
    handler._serializer = self.get_serializer()
    handler._view.parents = handler._view.parents + (self.collection,)

    start = handler.page * handler.page_size
    wrapper = SearchResultWrapper(search[start:start + handler.page_size])

    return handler.write({
        'meta': {
            'total': wrapper.count(),
            'perPage': handler.page_size
        },
        # TODO
        'links': {},
        'data': [handler.serialize(resource) for resource in wrapper]
    })
def get_results(search_object, searched_fields, value):
    search_query = value
    if not search_query:
        q = Q(query=None)
    else:
        q = Q("multi_match", query=search_query, fields=searched_fields, operator='and')

    search_object = search_object.query(q)[0:SEARCH_RESULTS_PER_PAGE]

    if search_query:
        for searched_field in searched_fields:
            search_object = search_object.highlight(searched_field)

    return search_object
def add_filters(self, filters, regexp=False, negative=False):
    """
    Add `filters` to the query.

    `filters` is a dict of the form {'field': value, 'field2': value2}, but you can also use a
    list of values instead of a `str`. They'll be combined with _or_ (and not _and_).

    :param dict filters:
    :param bool regexp:
    :param bool negative:
    :return:
    """
    # We need to use multi_match, since we get the fields names dynamically.
    for key, value in filters.items():
        if isinstance(value, set):
            value = list(value)

        # There is no need to process empty values.
        if not value:
            continue

        if isinstance(value, list):
            if negative:
                self.search = self.search.query(Q('bool', must_not=[
                    reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                )
            else:
                self.search = self.search.query(Q('bool', must=[
                    reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                )
        else:
            if negative:
                self.search = self.search.query(~Q("multi_match", query=value, fields=[key]))
            else:
                self.search = self.search.query(Q("multi_match", query=value, fields=[key]))
def filters(self, args):
    if self.name in args and args[self.name] == "":
        return esd.Q('missing', field=self.name)
    else:
        return esd.Q('terms', **{self.name: [args.get(self.name), ]})
def filters(self, args):
    if self.name in args and len(self.name) > 0:
        return esd.Q('prefix', **{self.name: args.get(self.name), })
    else:
        return super().filters(args)
def filters(self):
    range = {}
    if self.value[0] is not None:
        range["from"] = self.value[0]
    if self.value[1] is not None:
        range["to"] = self.value[1]
    return esd.Q('range', **{self.name: range})
def get_dsl_logoff_query(screen):
    q = None
    for evtid in config.EVENTS_LOGOFF:
        tmp = Q("match", event_identifier=evtid)
        if q is None:
            q = tmp
        else:
            q = q | tmp

    if screen is True:
        for evtid in config.EVENTS_LOGOFF_SCREEN:
            q = q | Q("match", event_identifier=evtid)

    return q
def get_dsl_logon_query(screen):
    q = None
    for evtid in config.EVENTS_LOGON:
        tmp = Q("match", event_identifier=evtid)
        if q is None:
            q = tmp
        else:
            q = q | tmp

    if screen is True:
        for evtid in config.EVENTS_LOGON_SCREEN:
            q = q | Q("match", event_identifier=evtid)

    return q
def get_logout_event(index, logonid, timestamp, maxtstamp, screen):
    """
    Look for the logoff event belonging to the given logon id or a shutdown event.
    """
    conn = connections.get_connection()

    # workaround to fix time precision issues
    timestamp = timestamp - 999

    logoff = get_dsl_logoff_query(screen)
    q = [
        Q('match', data_type='windows:evtx:record'),
        Q('match', xml_string=logonid),
        logoff
    ]
    s = Search(using=conn, index=index).query(Q('bool', must=q)).filter('range', datetime={'gte': timestamp, 'lte': maxtstamp}).sort('-datetime')
    res = s.execute()
    try:
        evt = res[0]
    except:
        evt = None

    if evt is None:
        q = [Q('match', event_identifier=config.EVENT_SHUTDOWN)]
        s = Search(using=conn, index=index).query(Q('bool', must=q)).filter('range', datetime={'gte': timestamp, 'lte': maxtstamp}).sort('-datetime')
        res = s.execute()
        try:
            evt = res[0]
        except:
            evt = None

    return evt
def get_last_shutdown(index, maxtstamp, pattern):
    """
    Look for the last shutdown event
    """
    conn = connections.get_connection()

    q = [
        Q('match', data_type='windows:evtx:record'),
        Q('match', event_identifier=config.EVENT_SHUTDOWN)
    ]

    if pattern:
        q.append(Q('query_string', query=pattern, analyze_wildcard=True))

    s = Search(using=conn, index=index).query(Q('bool', must=q)).filter('range', datetime={'lte': maxtstamp}).sort('-datetime')[0:0]
    s.aggs.bucket('computer', 'terms', field='computer_name.keyword').bucket('shutdown', 'top_hits', size=1)
    res = s.execute()

    ret = {}
    for item in res.aggregations['computer']['buckets']:
        ret[item['key']] = item['shutdown']['hits']['hits'][0]

    if len(ret.keys()) == 0:
        ret = None

    return ret
def get_last_event(index, computer=None, maxdate=None, pattern=None):
    conn = connections.get_connection()

    q = [
        Q('match', data_type='windows:evtx:record')
    ]
    if computer is not None:
        q.append(Q('match', computer_name=computer))
    if pattern:
        q.append(Q('query_string', query=pattern, analyze_wildcard=True))

    if maxdate:
        s = Search(using=conn, index=index).query(Q('bool', must=q)).filter('range', datetime={'lte': maxdate}).sort('-datetime')
    else:
        s = Search(using=conn, index=index).query(Q('bool', must=q)).sort('-datetime')

    if computer is None:
        s = s[0:0]
        s.aggs.bucket('computer', 'terms', field='computer_name.keyword').bucket('last', 'top_hits', size=1)

    res = s.execute()

    if computer is None:
        evt = {}
        for item in res.aggregations['computer']['buckets']:
            evt[item['key']] = item['last']['hits']['hits'][0]
        if len(evt.keys()) == 0:
            evt = None
    else:
        try:
            evt = res[0]
        except:
            evt = None

    return evt
def create_query_for_email(self, search, email):
    return search.query(elasticsearch_dsl.Q({"match": {'email': email}}))
def paginate(self, index, q='*', limit=None, size=None, id_only=True):
    if not size:
        size = self.bulk_size
    log.info('Limit %s, size %s (q = "%s")', limit, size, q)

    s = Search(using=self.client, index=index, doc_type=self.doc_type)
    s = s.query(Q('query_string', query=q))
    if limit:
        size = min(size, limit)
        s = s.extra(size=size)
    s = s.params(scroll='20m', size=size)
    if id_only:
        s = s.source(False)
    log.debug('Query: %s', simplejson.dumps(s.to_dict(), indent=2))

    hits = []
    overall = 0
    for h in s.scan():
        if limit is not None and overall >= limit:
            raise StopIteration()
        log.debug('Hit: %s (progress: %d)', h.meta.id, overall)
        if overall < limit or not limit:
            if id_only:
                hits.append(h.meta.id)
            else:
                hits.append(h.to_dict())
            if len(hits) == size:
                yield iter(hits)
                hits = []
                overall += size
    if len(hits):
        yield iter(hits)
    else:
        raise StopIteration()
def get_records_list():
    """
    Obtains a list of all records for the given type and time range.

    :return: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end and request.get_vars.type):
        json_response = '{"status": "Error", "data": "Some mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    type = escape(request.get_vars.type)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning, 'lte': end}}})
        elastic_bool.append({'term': {'@stat_type': type}})

        # Prepare query
        qx = Q({'bool': {'must': elastic_bool}})

        # Set query according to the statistic type
        search_ip = Search(using=client, index='_all').query(qx)
        search_ip.aggs.bucket('all_nested', 'nested', path='data_array')\
            .bucket('by_key', 'terms', field='data_array.key.raw', size=2147483647)\
            .bucket('stats_sum', 'sum', field='data_array.value')
        results = search_ip.execute()

        data = ""
        for all_buckets in results.aggregations.all_nested.by_key:
            data += all_buckets.key + "," + str(int(all_buckets.stats_sum.value)) + ","

        # Remove trailing comma
        data = data[:-1]

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = '{"status": "Error", "data": "Exception: ' + escape(str(e)) + '"}'
        return json_response
def get_host_flows():
    """
    Gets flows, packets, and bytes time series for a given host.

    Returns: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end and request.get_vars.aggregation and request.get_vars.host_ip):
        json_response = '{"status": "Error", "data": "Some mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    aggregation = escape(request.get_vars.aggregation)
    host_ip = escape(request.get_vars.host_ip)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning, 'lte': end}}})
        elastic_bool.append({'term': {'src_ip': host_ip}})

        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('by_time', 'date_histogram', field='@timestamp', interval=aggregation) \
            .metric('sum_of_flows', 'sum', field='stats.total.flow') \
            .metric('sum_of_packets', 'sum', field='stats.total.packets') \
            .metric('sum_of_bytes', 'sum', field='stats.total.bytes')
        result = s.execute()

        data = "Timestamp,Number of flows,Number of packets,Number of bytes;"
        for record in result.aggregations.by_time.buckets:
            timestamp = record.key
            number_of_flows = int(record.sum_of_flows.value)
            number_of_packets = int(record.sum_of_packets.value)
            number_of_bytes = int(record.sum_of_bytes.value)
            data += str(timestamp) + "," + str(number_of_flows) + "," + str(number_of_packets) + "," + str(number_of_bytes) + ";"

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
        return json_response
def get_host_distinct_ports():
    """
    Gets the time series of the number of distinct destination ports for a given host.

    Returns: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end and request.get_vars.aggregation and request.get_vars.host_ip):
        json_response = '{"status": "Error", "data": "Some mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    aggregation = escape(request.get_vars.aggregation)
    host_ip = escape(request.get_vars.host_ip)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning, 'lte': end}}})
        elastic_bool.append({'term': {'src_ip': host_ip}})

        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('by_time', 'date_histogram', field='@timestamp', interval=aggregation) \
            .metric('dport_avg', 'avg', field='stats.dport_count') \
            .metric('dport_max', 'max', field='stats.dport_count') \
            .metric('dport_min', 'min', field='stats.dport_count')
        result = s.execute()

        data_avg = []
        data_min_max = []
        data_max = []
        data_min = []
        for record in result.aggregations.by_time.buckets:
            timestamp = record.key
            maximum = round(record.dport_max.value, 2) if record.dport_max.value else None
            minimum = round(record.dport_min.value, 2) if record.dport_min.value else None
            data_avg.append([timestamp, round(record.dport_avg.value, 2) if record.dport_avg.value else None])
            data_min_max.append([timestamp, [minimum, maximum]])
            data_max.append(maximum)
            data_min.append(minimum)

        json_response = {"status": "Ok",
                         "data": {"data_avg": data_avg,
                                  "data_min_max": data_min_max,
                                  "data_min": data_min,
                                  "data_max": data_max}}
        return json.dumps(json_response)

    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
        return json_response
def get_host_distinct_peers():
    """
    Gets the time series of the number of distinct communication peers for a given host.

    Returns: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end and request.get_vars.aggregation and request.get_vars.host_ip):
        json_response = '{"status": "Error", "data": "Some mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    aggregation = escape(request.get_vars.aggregation)
    host_ip = escape(request.get_vars.host_ip)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning, 'lte': end}}})
        elastic_bool.append({'term': {'src_ip': host_ip}})

        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('by_time', 'date_histogram', field='@timestamp', interval=aggregation) \
            .metric('peer_avg', 'avg', field='stats.peer_number') \
            .metric('peer_max', 'max', field='stats.peer_number') \
            .metric('peer_min', 'min', field='stats.peer_number')
        result = s.execute()

        data_avg = []
        data_min_max = []
        data_max = []
        data_min = []
        for record in result.aggregations.by_time.buckets:
            timestamp = record.key
            maximum = round(record.peer_max.value, 2) if record.peer_max.value else None
            minimum = round(record.peer_min.value, 2) if record.peer_min.value else None
            data_avg.append([timestamp, round(record.peer_avg.value, 2) if record.peer_avg.value else None])
            data_min_max.append([timestamp, [minimum, maximum]])
            data_max.append(maximum)
            data_min.append(minimum)

        json_response = {"status": "Ok",
                         "data": {"data_avg": data_avg,
                                  "data_min_max": data_min_max,
                                  "data_min": data_min,
                                  "data_max": data_max}}
        return json.dumps(json_response)

    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
        return json_response
def get_records_list():
    """
    Obtains a list of all records for the given type and time range.

    :return: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end and request.get_vars.filter):
        json_response = '{"status": "Error", "data": "Some mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    filter = escape(request.get_vars.filter)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch(
            [{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning, 'lte': end}}})
        elastic_bool.append({'term': {'@type': 'external_dns_resolver'}})

        # Set filter
        if filter != 'none':
            elastic_should = []
            elastic_should.append({'term': {'src_ip': filter}})
            elastic_should.append({'term': {'resolver_ip.raw': filter}})
            elastic_bool.append({'bool': {'should': elastic_should}})
        qx = Q({'bool': {'must': elastic_bool}})

        # Search with maximum size aggregations
        search = Search(using=client, index='_all').query(qx)
        search.aggs.bucket('by_src', 'terms', field='src_ip', size=2147483647)\
            .bucket('by_dst', 'terms', field='resolver_ip.raw', size=2147483647)\
            .bucket('top_src_dst', 'top_hits', size=1, sort=[{'timestamp': {'order': 'desc'}}])
        results = search.execute()

        # Result parsing into CSV in format: timestamp, source_ip, resolver_ip, flows
        data = ""
        for src_aggregations in results.aggregations.by_src.buckets:
            for result in src_aggregations.by_dst.buckets:
                record = result.top_src_dst.hits.hits[0]["_source"]
                data += record["timestamp"].replace("T", " ").replace("Z", "") + "," + record["src_ip"] + "," \
                        + record["resolver_ip"] + "," + str(record["flows"]) + ","
        # Remove trailing comma
        data = data[:-1]

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = '{"status": "Error", "data": "Exception: ' + escape(str(e)) + '"}'
        return json_response
def query_geonames(self, placename):
    """
    Wrap search parameters into an elasticsearch query to the geonames index
    and return results.

    Parameters
    ----------
    conn: an elasticsearch Search conn, like the one returned by `setup_es()`

    placename: str
        the placename text extracted by NER system

    Returns
    -------
    out: The raw results of the elasticsearch query
    """
    # first first, try for country name
    if self.is_country(placename):
        q = {"multi_match": {"query": placename,
                             "fields": ['name', 'asciiname', 'alternativenames'],
                             "type": "phrase"}}
        r = Q("match", feature_code='PCLI')
        res = self.conn.query(q).query(r)[0:5].execute()  # always 5
        #self.country_exact = True
    else:
        # second, try for an exact phrase match
        q = {"multi_match": {"query": placename,
                             "fields": ['name^5', 'asciiname^5', 'alternativenames'],
                             "type": "phrase"}}
        res = self.conn.query(q)[0:50].execute()

        # if no results, use some fuzziness, but still require all terms to be present.
        # Fuzzy is not allowed in "phrase" searches.
        if res.hits.total == 0:
            # tried wrapping this in a {"constant_score" : {"query": ... but made it worse
            q = {"multi_match": {"query": placename,
                                 "fields": ['name', 'asciiname', 'alternativenames'],
                                 "fuzziness": 1,
                                 "operator": "and"}}
            #self.fuzzy = True  # idea was to preserve this info as a feature, but not using state like this
            res = self.conn.query(q)[0:50].execute()

    es_result = utilities.structure_results(res)
    return es_result
def create_search_obj(user, search_param_dict=None, filter_on_email_optin=False):
    """
    Creates a search object and prepares it with metadata and query parameters that
    we want to apply for all ES requests

    Args:
        user (User): User object
        search_param_dict (dict): A dict representing the body of an ES query
        filter_on_email_optin (bool): If true, filter out profiles where email_optin != True

    Returns:
        Search: elasticsearch_dsl Search object
    """
    staff_program_ids = get_advance_searchable_program_ids(user)
    is_advance_search_capable = bool(staff_program_ids)
    search_obj = Search(index=get_default_alias(), doc_type=_get_search_doc_types(is_advance_search_capable))
    # Update from search params first so our server-side filtering will overwrite it if necessary
    if search_param_dict is not None:
        search_obj.update_from_dict(search_param_dict)

    if not is_advance_search_capable:
        # Learners can't search for other learners with privacy set to private
        search_obj = search_obj.filter(
            ~Q('term', **{'profile.account_privacy': Profile.PRIVATE})
        )

    # Limit results to one of the programs the user is staff on
    search_obj = search_obj.filter(create_program_limit_query(
        user,
        staff_program_ids,
        filter_on_email_optin=filter_on_email_optin
    ))
    # Filter so that only filled_out profiles are seen
    search_obj = search_obj.filter(
        Q('term', **{'profile.filled_out': True})
    )
    # Force size to be the one we set on the server
    update_dict = {'size': settings.ELASTICSEARCH_DEFAULT_PAGE_SIZE}
    if search_param_dict is not None and search_param_dict.get('from') is not None:
        update_dict['from'] = search_param_dict['from']
    search_obj.update_from_dict(update_dict)

    return search_obj
def index(self, query=None, radius=None, center=None, sort_by=None, *args, **kwargs):
    if not query and not radius and not center:
        redirect('/jobs')

    search_query = JobElastic().search()
    relevance_sort = sort_by == 'scores'

    if query:
        keyword_queries = self._compute_keyword_queries(query)
        decay_functions = self._compute_decay_functions()

        search_query.query = Q(
            'function_score',
            query=keyword_queries,
            functions=decay_functions
        )
    else:
        relevance_sort = False

    try:
        geoloc_query = json.loads(center)
        coordinates = geoloc_query['coordinates']
        lat, lon = (coordinates['lat'], coordinates['lon'])
    except (ValueError, TypeError):
        # One of the following cases has occurred:
        #   - center wasn't a valid JSON string
        #   - radius couldn't be converted to float
        # Since both pieces of information are required to set a geolocation
        # filter, we ignore it.
        pass
    else:
        search_query = self._apply_geolocation_filters(
            search_query, (lat, lon), radius if radius else 5.0)

    date_sort = not relevance_sort
    if date_sort:
        search_query = self._apply_date_sort(search_query)

    return dict(sources=SOURCES,
                jobs=PaginatedSearch(search_query),
                job_offer_search_form=JobsResearchForm)
def get_subscribers(self, targetings, hours_whitelist, volume):
    logger.info("SubscriberService.get_subscribers: getting subscribers")
    start_time = time.time()

    timezones = [tz for tz in pytz.all_timezones
                 if (datetime.now(pytz.timezone(tz)).hour in hours_whitelist)]

    targetings.append({
        "field": "unsub",
        "operator": "NOT IN",
        "values": [1, "true"]
    })

    if timezones:
        targetings.append({
            "field": "timezone",
            "operator": "IN",
            "values": timezones
        })

    s = Search(using=es, index="users")
    operator_mappings = {
        'IN': 'must',
        'NOT IN': 'must_not',
    }

    q = Q()
    for condition in targetings:
        condition_pair = {condition["field"]: condition["values"]}
        terms_q = Q('terms', **condition_pair)
        bool_operator = operator_mappings[condition['operator']]
        bool_q = Q('bool', **{bool_operator: terms_q})
        q += bool_q
    s = s.query(q)
    s.query = dslq.FunctionScore(
        query=s.query,
        functions=[dslq.SF('random_score')],
        boost_mode="replace"
    )
    s = s[:volume]

    try:
        res = s.execute()
    except Exception as e:
        logger.error(f"SubscriberService.get_subscribers: Exception {e}")
    else:
        subscribers = []
        for row in res.hits:
            subscriber = row.to_dict()
            subscriber['_id'] = row.meta.id
            subscribers.append(subscriber)
        end_time = time.time()
        logger.debug(f"SubscriberService.get_subscribers: finished in "
                     f"{int((end_time - start_time) * 1000)}ms")
        return subscribers