我正在尝试使用O(n)复杂度的算法查找任何给定文本中每个符号的频率。我的算法如下:
s = len(text) P = 1.0/s freqs = {} for char in text: try: freqs[char]+=P except: freqs[char]=P
但我怀疑这种字典方法是否足够快,因为它取决于字典方法的基础实现。这是最快的方法吗?
更新:如果使用集合和整数,则速度不会增加。这是因为该算法已经具有O(n)的复杂度,因此根本不可能进行加速。
例如,1MB文本的结果:
without collections: real 0m0.695s with collections: real 0m0.625s
注意:表中的时间不包括加载文件所需的时间。
| approach | american-english, | big.txt, | time w.r.t. defaultdict | | | time, seconds | time, seconds | | |----------------+-------------------+---------------+-------------------------| | Counter | 0.451 | 3.367 | 3.6 | | setdefault | 0.348 | 2.320 | 2.5 | | list | 0.277 | 1.822 | 2 | | try/except | 0.158 | 1.068 | 1.2 | | defaultdict | 0.141 | 0.925 | 1 | | numpy | 0.012 | 0.076 | 0.082 | | S.Mark's ext. | 0.003 | 0.019 | 0.021 | | ext. in Cython | 0.001 | 0.008 | 0.0086 | #+TBLFM: $4=$3/@7$3;%.2g
使用的文件:'/usr/share/dict/american- english'和'big.txt'。
'/usr/share/dict/american- english'
'big.txt'
比较“ Counter”,“ setdefault”,“ list”,“ try / except”,“ defaultdict”,“ numpy”,“ cython”和@ S.Mark的解决方案的脚本位于http:// gist。 github.com/347000
最快的解决方案是用Cython编写的Python扩展:
import cython @cython.locals( chars=unicode, i=cython.Py_ssize_t, L=cython.Py_ssize_t[0x10000]) def countchars_cython(chars): for i in range(0x10000): # unicode code points > 0xffff are not supported L[i] = 0 for c in chars: L[c] += 1 return {unichr(i): L[i] for i in range(0x10000) if L[i]}
* python (dict) : 0.5 seconds * python (list) : 0.5 (ascii) (0.2 if read whole file in memory) * perl : 0.5 * python (numpy): 0.07 * c++ : 0.05 * c : 0.008 (ascii)
输入数据:
$ tail /usr/share/dict/american-english éclat's élan élan's émigré émigrés épée épées étude étude's études $ du -h /usr/share/dict/american-english 912K /usr/share/dict/american-english
#!/usr/bin/env python3.1 import collections, fileinput, textwrap chars = (ch for word in fileinput.input() for ch in word.rstrip()) # faster (0.4s) but less flexible: chars = open(filename).read() print(textwrap.fill(str(collections.Counter(chars)), width=79))
运行:
$ time -p python3.1 count_char.py /usr/share/dict/american-english 计数器({'e':87823,'s':86620,'i':66548,'a':62778,'n':56696,'r': 56286,'t':51588,'o':48425,'l':39914,'c':30020,'d':28068,'u':25810, “'”:24511,'g':22262,'p':20917,'m':20747,'h':18453,'b':14137,'y': 12367,'f':10049,'k':7800,'v':7573,'w':6924,'z':3088,'x':2082,'M': 1686,'C':1549,'S':1515,'q':1447,'B':1387,'j':1376,'A':1345,'P': 974,'L':912,'H':860,'T':858,'G':811,'D':809,'R':749,'K':656,'E': 618,'J':539,'N':531,'W':507,'F':502,'O':354,'I':344,'V':330,'Z': 150,'Y':140,'é':128,'U':117,'Q':63,'X':42,'è':29,'ö':12,'ü':12,12, 'ó':10,'á':10,'ä':7,'ê':6,'â':6,'ñ':6,'ç':4,'å':3,'û ':3,'í': 2,“ô”:2,“Å”:1}) 真实0.44 用户0.43 sys 0.01
time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F); END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) } ' /usr/share/dict/american-english
输出:
{'S'=> 1515,'K'=> 656,''=> 29,'d'=> 28068,'Y'=> 140,'E'=> 618,'y'=> 12367,' g'=> 22262,'e'=> 87823,''=> 2,'J'=> 539,''=> 241,''=> 3,''=> 6,''=> 4, ''=> 128,'D'=> 809,'q'=> 1447,'b'=> 14137,'z'=> 3088,'w'=> 6924,'Q'=> 63,'' => 10,'M'=> 1686,'C'=> 1549,''=> 10,'L'=> 912,'X'=> 42,'P'=> 974,''=> 12 ,'\''=> 24511,''=> 6,'a'=> 62778,'T'=> 858,'N'=> 531,'j'=> 1376,'Z'=> 150, 'u'=> 25810,'k'=>7800,'t'=> 51588,''=> 6,'W'=> 507,'v'=> 7573,'s'=> 86620,'B'=> 1387,'H'=> 860, 'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}66548,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917, 'R'=> 749,'o'=> 48425}66548,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917, 'R'=> 749,'o'=> 48425} 真实的0.51 用户0.49 sys 0.02
基[Ants Aasma的(已修改为支持unicode):
#!/usr/bin/env python import codecs, itertools, operator, sys import numpy filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english' # ucs2 or ucs4 python? dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))] # count ordinals text = codecs.open(filename, encoding='utf-8').read() a = numpy.frombuffer(text, dtype=dtype) counts = numpy.bincount(a) # pretty print counts = [(unichr(i), v) for i, v in enumerate(counts) if v] counts.sort(key=operator.itemgetter(1)) print ' '.join('("%s" %d)' % c for c in counts if c[0] not in ' \t\n')
("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823) real 0.07 user 0.06 sys 0.01
// $ g++ *.cc -lboost_program_options // $ ./a.out /usr/share/dict/american-english #include <iostream> #include <fstream> #include <cstdlib> // exit #include <boost/program_options/detail/utf8_codecvt_facet.hpp> #include <boost/tr1/unordered_map.hpp> #include <boost/foreach.hpp> int main(int argc, char* argv[]) { using namespace std; // open input file if (argc != 2) { cerr << "Usage: " << argv[0] << " <filename>\n"; exit(2); } wifstream f(argv[argc-1]); // assume the file has utf-8 encoding locale utf8_locale(locale(""), new boost::program_options::detail::utf8_codecvt_facet); f.imbue(utf8_locale); // count characters frequencies typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t; hashtable_t counts; for (wchar_t ch; f >> ch; ) counts[ch]++; // print result wofstream of("output.utf8"); of.imbue(utf8_locale); BOOST_FOREACH(hashtable_t::value_type i, counts) of << "(" << i.first << " " << i.second << ") "; of << endl; }
结果:
$ cat output.utf8 (í2)(O 354)(P 974)(Q 63)(R 749)(S 1,515)(ñ6)(T 858)(U 117)(ó10)(ô2)(V 330)(W 507)(X 42)(ö12)(Y 140)(Z 150)(û3)(ü12)(a 62,778)(b 14,137)(c 30,020)(d 28,068)(e 87,823)(f 10,049) (g 22,262)(h 18,453)(i 66,548)(j 1,376)(k 7,800)(l 39,914)(m 20,747)(n 56,696)(o 48,425)(p 20,917)(q 1,447)(r 56,286)(s 86,620)(t 51,588)(u 25,810)(Å1)('24,511)(v 7,573)(w 6,924)(x 2,082)(y 12,367)(z 3,088)(A 1,345)(B 1,387)(C 1,549) (á10)(â6)(D 809)(E 618)(F 502)(ä7)(å3)(G 811)(H 860)(ç4)(I 344)(J 539)(è 29)(K 656)(é128)(ê6)(L 912)(M 1,686)(N 531)
// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt #include <stdio.h> enum { N = 256 }; size_t counts[N]; int main(void) { // count characters int ch = -1; while((ch = getchar()) != EOF) ++counts[ch]; // print result size_t i = 0; for (; i < N; ++i) if (counts[i]) printf("('%c' %zu) ", (int)i, counts[i]); return 0; }