help make it faster please

pkilambi · Nov 10, 2005

I wrote this function which does the following:
after readling lines from file.It splits and finds the word occurences
through a hash table...for some reason this is quite slow..can some one
help me make it faster...
f = open(filename)
lines = f.readlines()
def create_words(lines):
cnt = 0
spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
for content in lines:
words=content.split()
countDict={}
wordlist = []
for w in words:
w=string.lower(w)
if w[-1] in spl_set: w = w[:-1]
if w != '':
if countDict.has_key(w):
countDict[w]=countDict[w]+1
else:
countDict[w]=1
wordlist = countDict.keys()
wordlist.sort()
cnt += 1
if countDict != {}:
for word in wordlist: print (word+' '+
str(countDict[word])+'\n')

bonono · Nov 10, 2005

why reload wordlist and sort it after each word processing ? seems that
it can be done after the for loop.

pkilambi · Nov 10, 2005

Actually I create a seperate wordlist for each so called line.Here line
I mean would be a paragraph in future...so I will have to recreate the
wordlist for each loop

pkilambi · Nov 10, 2005

Oh sorry indentation was messed here...the
wordlist = countDict.keys()
wordlist.sort()
should be outside the word loop.... now
def create_words(lines):
cnt = 0
spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
for content in lines:
words=content.split()
countDict={}
wordlist = []
for w in words:
w=string.lower(w)
if w[-1] in spl_set: w = w[:-1]
if w != '':
if countDict.has_key(w):
countDict[w]=countDict[w]+1
else:
countDict[w]=1
wordlist = countDict.keys()
wordlist.sort()
cnt += 1
if countDict != {}:
for word in wordlist: print (word+' '+
str(countDict[word])+'\n')

ok now this is the correct question I am asking...

bonono · Nov 10, 2005

don't know your intend so have no idea what it is for. However, you are
doing :

wordlist=contDict.keys()
wordlist.sort()

for every word processed yet you don't use the content of x in anyway
during the loop. Even if you need one fresh snapshot of contDict after
each word, I don't see the need for sorting. seems like wasting cycles
to me.

Lonnie Princehouse · Nov 10, 2005

You're making a new countDict for each line read from the file... is
that what you meant to do? Or are you trying to count word occurrences
across the whole file?

--

In general, any time string manipulation is going slowly, ask yourself,
"Can I use the re module for this?"

# disclaimer: untested code. probably contains typos

import re
word_finder = re.compile('[a-z0-9_]+', re.I)

def count_words (string, word_finder = word_finder): # avoid global
lookups
countDict = {}
for match in word_finder.finditer(string):
word = match.group(0)
countDict[word] = countDict.get(word,0) + 1
return countDict

f = open(filename)
for i, line in enumerate(f.xreadlines()):
countDict = count_words(line)
print "Line %s" % i
for word in sorted(countDict.keys()):
print " %s %s" % (word, countDict[word])

f.close()

bearophileHUGS · Nov 10, 2005

This can be faster, it avoids doing the same things more times:

from string import maketrans, ascii_lowercase, ascii_uppercase

def create_words(afile):
stripper = """'[",;<>{}_&?!():[]\.=+-*\t\n\r^%0123456789/"""
mapper = maketrans(stripper + ascii_uppercase,
" "*len(stripper) + ascii_lowercase)
countDict = {}
for line in afile:
for w in line.translate(mapper).split():
if w:
if w in countDict:
countDict[w] += 1
else:
countDict[w] = 1
word_freq = countDict.items()
word_freq.sort()
for word, freq in word_freq:
print word, freq

create_words(file("test.txt"))

If you can load the whole file in memory then it can be made a little
faster...

Bear hugs,
bearophile

Larry Bates · Nov 10, 2005

I wrote this function which does the following:
after readling lines from file.It splits and finds the word occurences
through a hash table...for some reason this is quite slow..can some one
help me make it faster...
f = open(filename)
lines = f.readlines()
def create_words(lines):
cnt = 0
spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
for content in lines:
words=content.split()
countDict={}
wordlist = []
for w in words:
w=string.lower(w)
if w[-1] in spl_set: w = w[:-1]
if w != '':
if countDict.has_key(w):
countDict[w]=countDict[w]+1
else:
countDict[w]=1
wordlist = countDict.keys()
wordlist.sort()
cnt += 1
if countDict != {}:
for word in wordlist: print (word+' '+
str(countDict[word])+'\n')

The way this is written you create a new countDict object
for every line of the file, it's not clear that this is
what you meant to do.

Also you are sorting wordlist for every line, not just
the entire file because it is inside the loop that is
processing lines.

Some extra work by testing for empty dictionary:

wordlist=countDict.keys()

then

if countdict != {}:
for word in wordlist:

if countDict is empty then wordlist will be empty so testing
for it is unnecessary.

Incrementing cnt, but never using it.

I don't think spl_set will do what you want, but I haven't modified
it. To split on all those characters you are going to need to
use regular expressions not split.

Modified code:

def create_words(lines):
spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
countDict={}
for content in lines:
words=content.split()
for w in words:
w=w.lower()
if w[-1] in spl_set: w = w[:-1]
if w:
if countDict.has_key(w):
countDict[w]=countDict[w]+1
else:
countDict[w]=1

return countDict

import time
filename=r'C:\cygwin\usr\share\vim\vim63\doc\version5.txt'
f = open(filename)
lines = f.readlines()
start_time=time.time()
countDict=create_words(lines)
stop_time=time.time()
elapsed_time=stop_time-start_time
wordlist = countDict.keys()
wordlist.sort()
for word in wordlist:
print "word=%s count=%i" % (word, countDict[word])

print "Elapsed time in create_words function=%.2f seconds" % elapsed_time

I ran this against a 551K text file and it runs in 0.11 seconds
on my machine (3.0Ghz P4).

Larry Bates

pkilambi · Nov 10, 2005

ok this sounds much better..could you tell me what to do if I want to
leave characters like @ in words.So I would like to consider this as a
part of word

Lonnie Princehouse · Nov 10, 2005

The word_finder regular expression defines what will be considered a
word.

"[a-z0-9_]" means "match a single character from the set {a through z,
0 through 9, underscore}".
The + means "match as many as you can, minimum of one"

To match @ as well, add it to the set of characters to match:

word_finder = re.compile('[a-z0-9_@]+', re.I)

The re.I flag makes the expression case insensitive.
See the documentation for re for more information.

Also--- It looks like I forgot to lowercase matched words. The line
word = match.group(0)
should read:
word = match.group(0).lower()

Fredrik Lundh · Nov 10, 2005

Lonnie said:
"[a-z0-9_]" means "match a single character from the set {a through z,
0 through 9, underscore}".

"\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
specify otherwise using the locale or unicode flags), but is handled more
efficiently by the RE engine.

(you can use \w in sets too, so you can do e.g. "[\w@]")

</F>

Sion Arrowsmith · Nov 11, 2005

Oh sorry indentation was messed here...the
wordlist = countDict.keys()
wordlist.sort()
should be outside the word loop.... now
def create_words(lines):
cnt = 0
spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
for content in lines:
words=content.split()
countDict={}
wordlist = []
for w in words:
w=string.lower(w)
if w[-1] in spl_set: w = w[:-1]
if w != '':
if countDict.has_key(w):
countDict[w]=countDict[w]+1
else:
countDict[w]=1
wordlist = countDict.keys()
wordlist.sort()
cnt += 1
if countDict != {}:
for word in wordlist: print (word+' '+
str(countDict[word])+'\n')

ok now this is the correct question I am asking...

(a) You might be better off doing:
words = words.lower()
for w in words:
...
instead of calling lower() on each separate word (and note that most
functions from string are deprecated in favour of string methods).

(b) spl_set isn't doing what you might think it is -- it looks like
you've written it as a regexp but your using it as a character set.
What you might want is:
spl_set = '",;<>{}_&?!():-[\.=+*\t\n\r'
and
while w[-1] in spl_set: w = w[:-1]
That loop can be written:
w = w.rstrip(spl_set)
(which by my timings is faster if you have multiple characters from
spl_set at the end of your word, but slower if you have 0 or 1).

Bengt Richter · Nov 11, 2005

This can be faster, it avoids doing the same things more times:

from string import maketrans, ascii_lowercase, ascii_uppercase

def create_words(afile):
stripper = """'[",;<>{}_&?!():[]\.=+-*\t\n\r^%0123456789/"""
mapper = maketrans(stripper + ascii_uppercase,
" "*len(stripper) + ascii_lowercase)

good way to prepare for split

countDict = {}
for line in afile:
for w in line.translate(mapper).split():
if w:

I suspect it's not possible to get '' in the list from somestring.split()

if w in countDict:
countDict[w] += 1
else:
countDict[w] = 1

does that beat the try and get versions? I.e., (untested)
try: countDict[w] += 1
except KeyError: countDict[w] = 1
or
countDict[w] = countDict.get(w, 0) + 1

word_freq = countDict.items()
word_freq.sort()
for word, freq in word_freq:
print word, freq

create_words(file("test.txt"))

If you can load the whole file in memory then it can be made a little
faster...

Bear hugs,
bearophile

Regards,
Bengt Richter

Sybren Stuvel · Nov 12, 2005

Bengt Richter enlightened us with:

I suspect it's not possible to get '' in the list from
somestring.split()

Time to adjust your suspicions:
['', 'abc', '']

countDict[w] += 1
else:
countDict[w] = 1

Click to expand...

does that beat the try and get versions? I.e., (untested)
try: countDict[w] += 1
except KeyError: countDict[w] = 1

OPs way is faster. A 'try' block is very fast, but trowing & handling
an exception is slow.

countDict[w] = countDict.get(w, 0) + 1

I think this has the potential of being faster than OPs method,
because it's likely to be fully implemented in C.

Sybren

bearophileHUGS · Nov 12, 2005

Thank you Bengt Richter and Sybren Stuvel for your comments, my little
procedure can be improved a bit in many ways, it was just a first
quickly written version (but it can be enough for a basic usage).

Bengt Richter:

good way to prepare for split

Maybe there is a better way, that is putting in just the accepted
letters and accented letters, instead of the not accepted symbols.

I suspect it's not possible to get '' in the list from somestring.split()

You are probably right, the algorithm used is different if you don't
give a splitting string:

' abc a '.split() ['abc', 'a']
'.abc..a.'.split(".")

Click to expand...

Click to expand...

['', 'abc', '', 'a', '']

does that beat the try and get versions? I.e., (untested)
Yes.

countDict[w] = countDict.get(w, 0) + 1

I think the if-else version is the fastest, I have tested it a long
time ago... You can easely do a speed test to see if I am wrong.

Bear hugs,
bearophile

Bengt Richter · Nov 12, 2005

Bengt Richter enlightened us with:

I suspect it's not possible to get '' in the list from
somestring.split()

Click to expand...

Time to adjust your suspicions:
['', 'abc', '']

I know about that one ;-)
I meant somestring.split() just like that -- without a splitter argument.
My suspicion remains ;-)

countDict[w] += 1
else:
countDict[w] = 1

Click to expand...

does that beat the try and get versions? I.e., (untested)
try: countDict[w] += 1
except KeyError: countDict[w] = 1

Click to expand...

OPs way is faster. A 'try' block is very fast, but trowing & handling
an exception is slow.

Yes, so it's data sensitive. Same with the other one not mentioned:
countDict.setdefault(w, [0])[0] += 1 # accum count in length one list vs as int
# too much subscripting work though

countDict[w] = countDict.get(w, 0) + 1

Click to expand...

I think this has the potential of being faster than OPs method,
because it's likely to be fully implemented in C.

Not sure what you mean. Don't forget, all those expressions are evaluated dynamically
step by step with byte codes being switched on in c, but still a bunch of steps. I.e.,

>>> import dis
>>> dis.dis(compile('countDict[w] = countDict.get(w, 0) + 1', '', 'exec'))

Click to expand...

Click to expand...

1 0 LOAD_NAME 0 (countDict)
3 LOAD_ATTR 1 (get)
6 LOAD_NAME 2 (w)
9 LOAD_CONST 0 (0)
12 CALL_FUNCTION 2
15 LOAD_CONST 1 (1)
18 BINARY_ADD
19 LOAD_NAME 0 (countDict)
22 LOAD_NAME 2 (w)
25 STORE_SUBSCR
26 LOAD_CONST 2 (None)
29 RETURN_VALUE

Looks like if you succeed most of the time without a KeyError, the try/except might be
a tad faster, depending on what the exception block setup/teardown costs.
... try: countDict[w] += 1
... except KeyError: countDict[w] = 1
... """, '', 'exec'))
1 0 SETUP_EXCEPT 20 (to 23)
3 LOAD_NAME 0 (countDict)
6 LOAD_NAME 1 (w)
9 DUP_TOPX 2
12 BINARY_SUBSCR
13 LOAD_CONST 0 (1)
16 INPLACE_ADD
17 ROT_THREE
18 STORE_SUBSCR
19 POP_BLOCK
20 JUMP_FORWARD 29 (to 52)

2 >> 23 DUP_TOP
24 LOAD_NAME 2 (KeyError)
27 COMPARE_OP 10 (exception match)
30 JUMP_IF_FALSE 17 (to 50)
33 POP_TOP
34 POP_TOP
35 POP_TOP
36 POP_TOP
37 LOAD_CONST 0 (1)
40 LOAD_NAME 0 (countDict)
43 LOAD_NAME 1 (w)
46 STORE_SUBSCR
47 JUMP_FORWARD 2 (to 52) 55 RETURN_VALUE

Regards,
Bengt Richter

Sybren Stuvel · Nov 13, 2005

Bengt Richter enlightened us with:

I meant somestring.split() just like that -- without a splitter
argument. My suspicion remains ;-)

Mine too ;-)

Sybren

Ron Adam · Nov 13, 2005

Fredrik said:
Lonnie Princehouse wrote:

"[a-z0-9_]" means "match a single character from the set {a through z,
0 through 9, underscore}".

Click to expand...

"\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
specify otherwise using the locale or unicode flags), but is handled more
efficiently by the RE engine.

(you can use \w in sets too, so you can do e.g. "[\w@]")

</F>

The \w does make a small difference, but not as much as I expected.
Surprisingly a standard Python word iterator works just as well, and is
easier to understand than the re version.

Which one is faster depends on the average word length and number of
ignored characters.

Cheers,
Ron

Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06820057 (best of 3)
count_words: 0.07333837 (best of 3)

#########

import string
import re
import time
import random

# Create a really ugly n length string to test with.
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
+ '_@$&*()#/<>' + ' \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())

letters = string.lowercase + string.digits + '_@'

def word_iter(text, letters=letters):
ltrs=letters.lower()
wd = ''
for c in text + ' ':
if c in ltrs:
wd += c
elif wd != '':
yield wd
wd = ''

def word_counter(text):
txt = text.lower()
countDict={}
for wd in word_iter(txt):
if wd in countDict:
countDict[wd] += 1
else:
countDict[wd] = 1
return countDict

word_finder = re.compile('[\w@]+', re.I)

def count_words(string, word_finder = word_finder):
# avoid global lookups
countDict = {}
for match in word_finder.finditer(string.lower()):
word = match.group(0)
countDict[word] = countDict.get(word,0) + 1
return countDict

foos = [word_counter, count_words]
r1 = r2 = None
for foo in foos:
best_time = 0
for n in range(3):
t = time.clock()
for line in lines.splitlines():
countDict = foo(line)
tt = time.clock()-t
if tt > best_time: best_time = tt

r1 = r2
r2 = countDict
if r1 != None:
# change to 1 if assert fails to find problem
if 0:
for k in r1.keys():
if r1[k] != r2[k]:
print k,r1[k],r2[k]
assert r1 == r2

print '%s: %.8f (best of %d)' \
% (foo.__name__, best_time, n+1)

Fredrik Lundh · Nov 13, 2005

Ron said:
The \w does make a small difference, but not as much as I expected.

that's probably because your benchmark has a lot of dubious overhead:

word_finder = re.compile('[\w@]+', re.I)

no need to force case-insensitive search here; \w looks for both lower-
and uppercase characters.

for match in word_finder.finditer(string.lower()):

since you're using a case-insensitive RE, that lower() call is not necessary.

word = match.group(0)

and findall() is of course faster than finditer() + m.group().

t = time.clock()
for line in lines.splitlines():
countDict = foo(line)
tt = time.clock()-t

and if you want performance, why are you creating a new dictionary for
each line in the sample?

here's a more optimized RE word finder:

word_finder_2 = re.compile('[\w@]+').findall

def count_words_2(string, word_finder=word_finder_2):
# avoid global lookups
countDict = {}
for word in word_finder(string):
countDict[word] = countDict.get(word,0) + 1
return countDict

with your original test on a slow machine, I get

count_words: 0.29868684 (best of 3)
count_words_2: 0.17244873 (best of 3)

if I call the function once, on the entire sample string, I get

count_words: 0.23096036 (best of 3)
count_words_2: 0.11690620 (best of 3)

</F>

Ron Adam · Nov 13, 2005

Fredrik said:
Ron Adam wrote:

that's probably because your benchmark has a lot of dubious overhead:

I think it does what the OP described, but that may not be what he
really needs.

Although the test to find best of n, instead was finding worse of n.
Which explains why I was getting a larger variance than I thought I
should have been getting.

word_finder = re.compile('[\w@]+', re.I)

Click to expand...

no need to force case-insensitive search here; \w looks for both lower-
and uppercase characters.

But the dictionary keys need to be either upper or lower otherwise you
count 'The' separately from 'the'.

since you're using a case-insensitive RE, that lower() call is not necessary.

and findall() is of course faster than finditer() + m.group().

Cool, I don't use re that often so I just used what was posted to test
against.

and if you want performance, why are you creating a new dictionary for
each line in the sample?

Because that's what the OP apparently wanted. A line by line word
count. I originally did it to get an the over all count and then change
it so it matched the re version that was posted.

here's a more optimized RE word finder:

word_finder_2 = re.compile('[\w@]+').findall

def count_words_2(string, word_finder=word_finder_2):
# avoid global lookups
countDict = {}
for word in word_finder(string):
countDict[word] = countDict.get(word,0) + 1
return countDict

with your original test on a slow machine, I get

count_words: 0.29868684 (best of 3)
count_words_2: 0.17244873 (best of 3)

if I call the function once, on the entire sample string, I get

count_words: 0.23096036 (best of 3)
count_words_2: 0.11690620 (best of 3)

</F>

Wow, a lot bigger difference than on my machine. <curious> An athlon
64 3000+ on winxp. I'm not sure how much difference that would make?

This is what I get after adding the above version to it, with the
lower(). There's not quite as big a difference as you get, but the find
all version is still faster than both the others.

Cheers,
Ron

Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06245989 (best of 3)
count_words: 0.07309812 (best of 3)
count_words_2: 0.04981024 (best of 3)

And as count all words...

Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.05325006 (best of 3)
count_words: 0.05910528 (best of 3)
count_words_2: 0.03748158 (best of 3)

They all improve, but the re find all version is clearly better.

#####################

import string
import re
import time
import random

# Create a really ugly n length string to test with.
# The word length are
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
+ '_@$&*()#/<>' + ' \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())

letters = string.lowercase + string.digits + '_@'

def word_iter(text, letters=letters):
wd = ''
for c in text + ' ':
if c in letters:
wd += c
elif wd != '':
yield wd
wd = ''

def word_counter(text):
countDict={}
for wd in word_iter(text.lower()):
if wd in countDict:
countDict[wd] += 1
else:
countDict[wd] = 1
return countDict

word_finder = re.compile('[\w@]+', re.I).finditer

def count_words(string, word_finder=word_finder):
# avoid global lookups
countDict = {}
for match in word_finder(string.lower()):
word = match.group(0)
countDict[word] = countDict.get(word,0) + 1
return countDict

word_finder_2 = re.compile('[\w@]+').findall

def count_words_2(string, word_finder=word_finder_2):
# avoid global lookups
countDict = {}
for word in word_finder(string.lower()):
countDict[word] = countDict.get(word,0) + 1
return countDict

foos = [word_counter, count_words, count_words_2]
r1 = r2 = None
for foo in foos:
best_time = 1000000 # too large to be useful on purpose
for n in range(3):
t = time.clock()
#for line in lines.splitlines():
countDict = foo(lines)
tt = time.clock()-t
best_time = min(tt, best_time)

r1 = r2
r2 = countDict
if r1 != None:
# change to 1 if assert fails to find problem
if 0:
for k in r1.keys():
if r1[k] != r2[k]:
print k,r1[k],r2[k]
assert r1 == r2

print '%s: %.8f (best of %d)' \
% (foo.__name__, best_time, n+1)

Code help please	4	May 19, 2023
Arduino Code Please Help	0	Oct 29, 2024
Push regex search result into hash with multiple values	14	May 19, 2014
Translater + module + tkinter	1	Feb 16, 2023
Can't solve problems! please Help	0	Sep 26, 2022
How can I make this piece of code even faster?	18	Jul 20, 2013
Word matching with specific parameters	1	Jan 26, 2025
A website that I couldn't make a screenshot of it nor save any page from.	1	Oct 29, 2023

help make it faster please

pkilambi

bonono

pkilambi

pkilambi

bonono

Lonnie Princehouse

bearophileHUGS

Larry Bates

pkilambi

Lonnie Princehouse

Fredrik Lundh

Sion Arrowsmith

Bengt Richter

Sybren Stuvel

bearophileHUGS

Bengt Richter

Sybren Stuvel

Ron Adam

Fredrik Lundh

Ron Adam

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads