我有很多大的(> 35,000,000)整数列表,其中将包含重复项。我需要获取列表中每个整数的计数。以下代码有效,但似乎很慢。还有人可以使用Python最好是Numpy更好地进行基准测试吗?
def group(): import numpy as np from itertools import groupby values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4') values.sort() groups = ((k,len(list(g))) for k,g in groupby(values)) index = np.fromiter(groups,dtype='u4,u2') if __name__=='__main__': from timeit import Timer t = Timer("group()","from __main__ import group") print t.timeit(number=1)
返回:
$ python bench.py 111.377498865
干杯!
*根据回复进行 *编辑 :
def group_original(): import numpy as np from itertools import groupby values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4') values.sort() groups = ((k,len(list(g))) for k,g in groupby(values)) index = np.fromiter(groups,dtype='u4,u2') def group_gnibbler(): import numpy as np from itertools import groupby values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4') values.sort() groups = ((k,sum(1 for i in g)) for k,g in groupby(values)) index = np.fromiter(groups,dtype='u4,u2') def group_christophe(): import numpy as np values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4') values.sort() counts=values.searchsorted(values, side='right') - values.searchsorted(values, side='left') index = np.zeros(len(values),dtype='u4,u2') index['f0']=values index['f1']=counts #Erroneous result! def group_paul(): import numpy as np values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4') values.sort() diff = np.concatenate(([1],np.diff(values))) idx = np.concatenate((np.where(diff)[0],[len(values)])) index = np.empty(len(idx)-1,dtype='u4,u2') index['f0']=values[idx[:-1]] index['f1']=np.diff(idx) if __name__=='__main__': from timeit import Timer timings=[ ("group_original","Original"), ("group_gnibbler","Gnibbler"), ("group_christophe","Christophe"), ("group_paul","Paul"), ] for method,title in timings: t = Timer("%s()"%method,"from __main__ import %s"%method) print "%s: %s secs"%(title,t.timeit(number=1))
$ python bench.py Original: 113.385262966 secs Gnibbler: 71.7464978695 secs Christophe: 27.1690568924 secs Paul: 9.06268405914 secs
尽管Christophe目前给出的结果不正确
我做类似这样的事情得到了3倍的改进:
def group(): import numpy as np values = np.array(np.random.randint(0,3298,size=35000000),dtype='u4') values.sort() dif = np.ones(values.shape,values.dtype) dif[1:] = np.diff(values) idx = np.where(dif>0) vals = values[idx] count = np.diff(idx)