python elasticsearch 深度分页——scroll的使用与清除,clear_scroll

网上的大部分教程都讲到了 Elasticsearch 使用 scroll 游标的方法，但使用后往往没有清除游标。这会导致同时打开的 scroll 数量超过最大限制而报错，因此应该在任务结束时手动清理 scroll（否则只能等到设定的保留时间过期后，游标才会被自动清理）。

from elasticsearch import Elasticsearch


def main():
    es = Elasticsearch([***], http_auth = ('***', '****'), port = *** )
    query = ***
    page = es.search(
                index= ** *,
                scroll = '2m',
                size = 1000,
                body = {"query": query})
    sid = page['_scroll_id']
    sid_list = [sid]
    scroll_size_max = page['hits']['total']['value']
    cnt = 0
    while cnt < scroll_size_max:
        for info in page['hits']['hits']:
            # do something
            cnt += 1
        page = es.scroll(scroll_2m')
        sid = page['_scroll_id']
        sid_list.append(sid)
    for sid_del in sid_list:
        es.clear_scroll(scroll_id=sid_del)

# Run the scroll example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()