小编典典

Python-字典查找每个字符的频率慢吗?

algorithm

我正在尝试使用O(n)复杂度的算法查找任何给定文本中每个符号的频率。我的算法如下:

s = len(text) 
P = 1.0/s 
freqs = {} 
for char in text: 
    try: 
       freqs[char]+=P 
    except: 
       freqs[char]=P

但我怀疑这种字典方法是否足够快,因为它取决于字典方法的基础实现。这是最快的方法吗?

更新:如果使用集合和整数,则速度不会增加。这是因为该算法已经具有O(n)的复杂度,因此根本不可能进行加速。

例如,1MB文本的结果:

without collections:
real    0m0.695s

with collections:
real    0m0.625s

阅读 332

收藏
2020-07-28

共1个答案

小编典典

性能比较

注意:表中的时间不包括加载文件所需的时间。

| approach       | american-english, |      big.txt, | time w.r.t. defaultdict |
|                |     time, seconds | time, seconds |                         |
|----------------+-------------------+---------------+-------------------------|
| Counter        |             0.451 |         3.367 |                     3.6 |
| setdefault     |             0.348 |         2.320 |                     2.5 |
| list           |             0.277 |         1.822 |                       2 |
| try/except     |             0.158 |         1.068 |                     1.2 |
| defaultdict    |             0.141 |         0.925 |                       1 |
| numpy          |             0.012 |         0.076 |                   0.082 |
| S.Mark's ext.  |             0.003 |         0.019 |                   0.021 |
| ext. in Cython |             0.001 |         0.008 |                  0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g

使用的文件:'/usr/share/dict/american- english''big.txt'

比较“ Counter”,“ setdefault”,“ list”,“ try / except”,“ defaultdict”,“ numpy”,“
cython”和@ S.Mark的解决方案的脚本位于http:// gist。
github.com/347000

最快的解决方案是用Cython编写的Python扩展:

import cython

@cython.locals(
    chars=unicode,
    i=cython.Py_ssize_t,
    L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    for i in range(0x10000): # unicode code points > 0xffff are not supported
        L[i] = 0

    for c in chars:
        L[c] += 1

    return {unichr(i): L[i] for i in range(0x10000) if L[i]}

以前的比较:

* python (dict) : 0.5  seconds
* python (list) : 0.5  (ascii) (0.2 if read whole file in memory)
* perl          : 0.5
* python (numpy): 0.07 
* c++           : 0.05
* c             : 0.008 (ascii)

输入数据:

$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études

$ du -h /usr/share/dict/american-english
912K    /usr/share/dict/american-english

python(计数器):0.5秒

#!/usr/bin/env python3.1
import collections, fileinput, textwrap

chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))

运行:

$ time -p python3.1 count_char.py /usr/share/dict/american-english



计数器({'e':87823,'s':86620,'i':66548,'a':62778,'n':56696,'r':
56286,'t':51588,'o':48425,'l':39914,'c':30020,'d':28068,'u':25810,
“'”:24511,'g':22262,'p':20917,'m':20747,'h':18453,'b':14137,'y':
12367,'f':10049,'k':7800,'v':7573,'w':6924,'z':3088,'x':2082,'M':
1686,'C':1549,'S':1515,'q':1447,'B':1387,'j':1376,'A':1345,'P':
974,'L':912,'H':860,'T':858,'G':811,'D':809,'R':749,'K':656,'E':
618,'J':539,'N':531,'W':507,'F':502,'O':354,'I':344,'V':330,'Z':
150,'Y':140,'é':128,'U':117,'Q':63,'X':42,'è':29,'ö':12,'ü':12,12,
'ó':10,'á':10,'ä':7,'ê':6,'â':6,'ñ':6,'ç':4,'å':3,'û ':3,'í':
2,“ô”:2,“Å”:1})
真实0.44
用户0.43
sys 0.01

perl:0.5秒

time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english

输出:

{'S'=> 1515,'K'=> 656,''=> 29,'d'=> 28068,'Y'=> 140,'E'=> 618,'y'=> 12367,' g'=> 22262,'e'=> 87823,''=> 2,'J'=> 539,''=> 241,''=> 3,''=> 6,''=> 4, ''=> 128,'D'=> 809,'q'=> 1447,'b'=> 14137,'z'=> 3088,'w'=> 6924,'Q'=> 63,'' => 10,'M'=> 1686,'C'=> 1549,''=> 10,'L'=> 912,'X'=> 42,'P'=> 974,''=> 12 ,'\''=> 24511,''=> 6,'a'=> 62778,'T'=> 858,'N'=> 531,'j'=> 1376,'Z'=> 150, 'u'=> 25810,'k'=>7800,'t'=> 51588,''=> 6,'W'=> 507,'v'=> 7573,'s'=> 86620,'B'=> 1387,'H'=> 860, 'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}'c'=> 30020,''=> 12,'I'=> 344,''=> 3,'G'=> 811,'U'=> 117,'F'=> 502,''= > 2,'r'=> 56286,'x'=> 2082,'V'=> 330,'h'=> 18453,'f'=> 10049,''=> 1,'i'=> 66548 ,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917,' R'=> 749,'o'=> 48425}66548,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917, 'R'=> 749,'o'=> 48425}66548,'A'=> 1345,'O'=> 354,'n'=> 56696,'m'=> 20747,'l'=> 39914,''=> 7,'p'=> 20917, 'R'=> 749,'o'=> 48425}
真实的0.51
用户0.49
sys 0.02

python(numpy):0.07秒

基[Ants Aasma的(已修改为支持unicode):

#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy

filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'

# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]

# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)

# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts  if c[0] not in ' \t\n')

输出:

("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01

C ++:0.05秒

// $ g++ *.cc -lboost_program_options 
// $ ./a.out /usr/share/dict/american-english    
#include <iostream>
#include <fstream>
#include <cstdlib> // exit

#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>

int main(int argc, char* argv[]) {
  using namespace std;

  // open input file
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " <filename>\n";
    exit(2);
  }
  wifstream f(argv[argc-1]);

  // assume the file has utf-8 encoding
  locale utf8_locale(locale(""), 
      new boost::program_options::detail::utf8_codecvt_facet);
  f.imbue(utf8_locale);

  // count characters frequencies
  typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;  
  hashtable_t counts;
  for (wchar_t ch; f >> ch; )
    counts[ch]++;

  // print result
  wofstream of("output.utf8");
  of.imbue(utf8_locale);
  BOOST_FOREACH(hashtable_t::value_type i, counts) 
    of << "(" << i.first << " " << i.second << ") ";
  of << endl;
}

结果:

$ cat output.utf8



(í2)(O 354)(P 974)(Q 63)(R 749)(S 1,515)(ñ6)(T 858)(U 117)(ó10)(ô2)(V 330)(W 507)(X 42)(ö12)(Y 140)(Z 150)(û3)(ü12)(a 62,778)(b 14,137)(c 30,020)(d 28,068)(e 87,823)(f 10,049) (g 22,262)(h 18,453)(i 66,548)(j 1,376)(k 7,800)(l 39,914)(m 20,747)(n 56,696)(o 48,425)(p 20,917)(q 1,447)(r 56,286)(s 86,620)(t 51,588)(u 25,810)(Å1)('24,511)(v 7,573)(w 6,924)(x 2,082)(y 12,367)(z 3,088)(A 1,345)(B 1,387)(C 1,549) (á10)(â6)(D 809)(E 618)(F 502)(ä7)(å3)(G 811)(H 860)(ç4)(I 344)(J 539)(è 29)(K 656)(é128)(ê6)(L 912)(M 1,686)(N 531)

c(ascii):0.0079秒

// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>

enum { N = 256 };
size_t counts[N];

int main(void) {
  // count characters
  int ch = -1;
  while((ch = getchar()) != EOF)
    ++counts[ch];

  // print result
  size_t i = 0;
  for (; i < N; ++i) 
    if (counts[i])
      printf("('%c' %zu) ", (int)i, counts[i]);
  return 0;
}
2020-07-28