help make it faster please


pkilambi

I wrote this function which does the following: after reading lines from
a file, it splits them and counts word occurrences in a hash table... for
some reason this is quite slow... can someone help me make it faster?
f = open(filename)
lines = f.readlines()

def create_words(lines):
    cnt = 0
    spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
    for content in lines:
        words = content.split()
        countDict = {}
        wordlist = []
        for w in words:
            w = string.lower(w)
            if w[-1] in spl_set: w = w[:-1]
            if w != '':
                if countDict.has_key(w):
                    countDict[w] = countDict[w] + 1
                else:
                    countDict[w] = 1
            wordlist = countDict.keys()
            wordlist.sort()
        cnt += 1
        if countDict != {}:
            for word in wordlist:
                print (word + ' ' + str(countDict[word]) + '\n')
 

bonono

Why rebuild wordlist and sort it after processing each word? It seems that
it can be done after the for loop.
 

pkilambi

Actually I create a separate wordlist for each so-called line. By "line"
here I really mean what will become a paragraph in future... so I will
have to recreate the wordlist on each loop.
 

pkilambi

Oh sorry, the indentation got messed up here... the

    wordlist = countDict.keys()
    wordlist.sort()

should be outside the word loop... now:
def create_words(lines):
    cnt = 0
    spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
    for content in lines:
        words = content.split()
        countDict = {}
        wordlist = []
        for w in words:
            w = string.lower(w)
            if w[-1] in spl_set: w = w[:-1]
            if w != '':
                if countDict.has_key(w):
                    countDict[w] = countDict[w] + 1
                else:
                    countDict[w] = 1
        wordlist = countDict.keys()
        wordlist.sort()
        cnt += 1
        if countDict != {}:
            for word in wordlist:
                print (word + ' ' + str(countDict[word]) + '\n')

ok now this is the correct question I am asking...
 

bonono

I don't know your intent, so I have no idea what it is for. However, you are
doing:

    wordlist = countDict.keys()
    wordlist.sort()

for every word processed, yet you don't use the content of wordlist in any
way during the loop. Even if you need one fresh snapshot of countDict after
each word, I don't see the need for sorting. Seems like wasting cycles to me.
 

Lonnie Princehouse

You're making a new countDict for each line read from the file... is
that what you meant to do? Or are you trying to count word occurrences
across the whole file?

--

In general, any time string manipulation is going slowly, ask yourself,
"Can I use the re module for this?"

# disclaimer: untested code. probably contains typos

import re

word_finder = re.compile('[a-z0-9_]+', re.I)

def count_words(string, word_finder=word_finder):  # avoid global lookups
    countDict = {}
    for match in word_finder.finditer(string):
        word = match.group(0)
        countDict[word] = countDict.get(word, 0) + 1
    return countDict

f = open(filename)
for i, line in enumerate(f.xreadlines()):
    countDict = count_words(line)
    print "Line %s" % i
    for word in sorted(countDict.keys()):
        print " %s %s" % (word, countDict[word])

f.close()
 

bearophileHUGS

This can be faster; it avoids doing the same things multiple times:

from string import maketrans, ascii_lowercase, ascii_uppercase

def create_words(afile):
    stripper = """'[",;<>{}_&?!():[]\.=+-*\t\n\r^%0123456789/"""
    mapper = maketrans(stripper + ascii_uppercase,
                       " "*len(stripper) + ascii_lowercase)
    countDict = {}
    for line in afile:
        for w in line.translate(mapper).split():
            if w:
                if w in countDict:
                    countDict[w] += 1
                else:
                    countDict[w] = 1
    word_freq = countDict.items()
    word_freq.sort()
    for word, freq in word_freq:
        print word, freq

create_words(file("test.txt"))


If you can load the whole file in memory then it can be made a little
faster...
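For instance, an untested sketch of that whole-file variant (the name
create_words_whole is made up just for illustration; it reuses the same
stripper/mapper idea as above):

def create_words_whole(afile):
    # untested sketch: translate and split the whole file content in one
    # call instead of looping over it line by line
    stripper = """'[",;<>{}_&?!():[]\.=+-*\t\n\r^%0123456789/"""
    mapper = maketrans(stripper + ascii_uppercase,
                       " "*len(stripper) + ascii_lowercase)
    countDict = {}
    for w in afile.read().translate(mapper).split():
        countDict[w] = countDict.get(w, 0) + 1
    word_freq = countDict.items()
    word_freq.sort()
    for word, freq in word_freq:
        print word, freq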

Bear hugs,
bearophile
 

Larry Bates

    I wrote this function which does the following: after reading lines from
    a file, it splits them and counts word occurrences in a hash table... for
    some reason this is quite slow... can someone help me make it faster?

        f = open(filename)
        lines = f.readlines()

        def create_words(lines):
            cnt = 0
            spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
            for content in lines:
                words = content.split()
                countDict = {}
                wordlist = []
                for w in words:
                    w = string.lower(w)
                    if w[-1] in spl_set: w = w[:-1]
                    if w != '':
                        if countDict.has_key(w):
                            countDict[w] = countDict[w] + 1
                        else:
                            countDict[w] = 1
                wordlist = countDict.keys()
                wordlist.sort()
                cnt += 1
                if countDict != {}:
                    for word in wordlist:
                        print (word + ' ' + str(countDict[word]) + '\n')
The way this is written you create a new countDict object for every line of
the file; it's not clear that this is what you meant to do.

Also, you are sorting wordlist for every line, not just once for the entire
file, because the sort is inside the loop that processes lines.

There is some extra work in testing for an empty dictionary:

    wordlist = countDict.keys()

then

    if countDict != {}:
        for word in wordlist:

If countDict is empty then wordlist will be empty, so testing for it is
unnecessary.

You are incrementing cnt, but never using it.

I don't think spl_set will do what you want, but I haven't modified it. To
split on all those characters you are going to need to use regular
expressions, not split.
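For example, an untested sketch of what that might look like (the
split_words name and the exact character class are just guesses at what
is wanted):

import re

# split on runs of whitespace or any of the special characters
splitter = re.compile(r'[",;<>{}_&?!():\[\]\.=+*\s-]+')

def split_words(content):
    return [w for w in splitter.split(content.lower()) if w]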


Modified code:

def create_words(lines):
    spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
    countDict = {}
    for content in lines:
        words = content.split()
        for w in words:
            w = w.lower()
            if w[-1] in spl_set: w = w[:-1]
            if w:
                if countDict.has_key(w):
                    countDict[w] = countDict[w] + 1
                else:
                    countDict[w] = 1

    return countDict


import time
filename = r'C:\cygwin\usr\share\vim\vim63\doc\version5.txt'
f = open(filename)
lines = f.readlines()
start_time = time.time()
countDict = create_words(lines)
stop_time = time.time()
elapsed_time = stop_time - start_time
wordlist = countDict.keys()
wordlist.sort()
for word in wordlist:
    print "word=%s count=%i" % (word, countDict[word])

print "Elapsed time in create_words function=%.2f seconds" % elapsed_time

I ran this against a 551K text file and it runs in 0.11 seconds
on my machine (3.0GHz P4).

Larry Bates
 

pkilambi

OK, this sounds much better... could you tell me what to do if I want to
leave characters like @ in words? That is, I would like to consider @ as
part of the word.
 

Lonnie Princehouse

The word_finder regular expression defines what will be considered a
word.

"[a-z0-9_]" means "match a single character from the set {a through z,
0 through 9, underscore}".
The + means "match as many as you can, minimum of one"

To match @ as well, add it to the set of characters to match:

word_finder = re.compile('[a-z0-9_@]+', re.I)

The re.I flag makes the expression case insensitive.
See the documentation for re for more information.


Also, it looks like I forgot to lowercase matched words. The line

    word = match.group(0)

should read:

    word = match.group(0).lower()
 

Fredrik Lundh

Lonnie said:

    "[a-z0-9_]" means "match a single character from the set {a through z,
    0 through 9, underscore}".

"\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
specify otherwise using the locale or unicode flags), but is handled more
efficiently by the RE engine.

(you can use \w in sets too, so you can do e.g. "[\w@]")
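For instance, a quick illustration of \w plus "@" in a character class
(untested, but it should give something like):

>>> import re
>>> re.findall(r'[\w@]+', 'mail user@example.com 2 times!')
['mail', 'user@example', 'com', '2', 'times']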

</F>
 

Sion Arrowsmith

    Oh sorry, the indentation got messed up here... the

        wordlist = countDict.keys()
        wordlist.sort()

    should be outside the word loop... now:

    def create_words(lines):
        cnt = 0
        spl_set = '[",;<>{}_&?!():-[\.=+*\t\n\r]+'
        for content in lines:
            words = content.split()
            countDict = {}
            wordlist = []
            for w in words:
                w = string.lower(w)
                if w[-1] in spl_set: w = w[:-1]
                if w != '':
                    if countDict.has_key(w):
                        countDict[w] = countDict[w] + 1
                    else:
                        countDict[w] = 1
            wordlist = countDict.keys()
            wordlist.sort()
            cnt += 1
            if countDict != {}:
                for word in wordlist:
                    print (word + ' ' + str(countDict[word]) + '\n')

    ok now this is the correct question I am asking...

(a) You might be better off doing:

    content = content.lower()
    for w in content.split():
        ...

instead of calling lower() on each separate word (and note that most
functions from the string module are deprecated in favour of string
methods).

(b) spl_set isn't doing what you might think it is -- it looks like you've
written it as a regexp but you're using it as a character set. What you
might want is:

    spl_set = '",;<>{}_&?!():-[\.=+*\t\n\r'

and

    while w[-1] in spl_set: w = w[:-1]

That loop can be written:

    w = w.rstrip(spl_set)

(which by my timings is faster if you have multiple characters from
spl_set at the end of your word, but slower if you have 0 or 1).
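If you want to check that on your own data, here's an untested timeit
sketch (the sample word is made up):

import timeit

spl_set = '",;<>{}_&?!():-[\.=+*\t\n\r'
w = 'hello)...'   # made-up sample word with several trailing spl_set chars

setup = "spl_set = %r; w = %r" % (spl_set, w)
loop_stmt = "x = w\nwhile x[-1] in spl_set: x = x[:-1]"
rstrip_stmt = "x = w.rstrip(spl_set)"

print "while loop:", timeit.Timer(loop_stmt, setup).timeit(100000)
print "rstrip():  ", timeit.Timer(rstrip_stmt, setup).timeit(100000)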
 

Bengt Richter

    This can be faster; it avoids doing the same things multiple times:

        from string import maketrans, ascii_lowercase, ascii_uppercase

        def create_words(afile):
            stripper = """'[",;<>{}_&?!():[]\.=+-*\t\n\r^%0123456789/"""
            mapper = maketrans(stripper + ascii_uppercase,
                               " "*len(stripper) + ascii_lowercase)

good way to prepare for split

            countDict = {}
            for line in afile:
                for w in line.translate(mapper).split():
                    if w:

I suspect it's not possible to get '' in the list from somestring.split()

                        if w in countDict:
                            countDict[w] += 1
                        else:
                            countDict[w] = 1

does that beat the try and get versions? I.e., (untested)

    try: countDict[w] += 1
    except KeyError: countDict[w] = 1

or

    countDict[w] = countDict.get(w, 0) + 1

            word_freq = countDict.items()
            word_freq.sort()
            for word, freq in word_freq:
                print word, freq

        create_words(file("test.txt"))

    If you can load the whole file in memory then it can be made a little
    faster...

    Bear hugs,
    bearophile

Regards,
Bengt Richter
 

Sybren Stuvel

Bengt Richter enlightened us with:

    I suspect it's not possible to get '' in the list from somestring.split()

Time to adjust your suspicions:

    ['', 'abc', '']

    does that beat the try and get versions? I.e., (untested)
        try: countDict[w] += 1
        except KeyError: countDict[w] = 1

The OP's way is faster. A 'try' block is very fast, but throwing & handling
an exception is slow.

    countDict[w] = countDict.get(w, 0) + 1

I think this has the potential of being faster than the OP's method,
because it's likely to be fully implemented in C.

Sybren
 

bearophileHUGS

Thank you Bengt Richter and Sybren Stuvel for your comments; my little
procedure can be improved in many ways, it was just a quickly written first
version (but it can be enough for basic usage).

Bengt Richter:

    good way to prepare for split

Maybe there is a better way: putting in just the accepted letters (and
accented letters), instead of the rejected symbols.

    I suspect it's not possible to get '' in the list from somestring.split()

You are probably right; the algorithm used is different if you don't give a
splitting string:

    >>> ' abc a '.split()
    ['abc', 'a']
    >>> '.abc..a.'.split(".")
    ['', 'abc', '', 'a', '']

    does that beat the try and get versions? I.e., (untested)

Yes.

    countDict[w] = countDict.get(w, 0) + 1

I think the if-else version is the fastest; I tested it a long time ago...
You can easily do a speed test to see if I am wrong.
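For what it's worth, an untested timeit sketch of such a speed test (the
sample data is made up: a few distinct short words, so most lookups hit an
existing key):

import timeit

setup = """
import random
random.seed(1)
words = [random.choice('abcdefghij') * random.randint(1, 3)
         for i in xrange(10000)]
"""

if_else = """
d = {}
for w in words:
    if w in d:
        d[w] += 1
    else:
        d[w] = 1
"""

try_except = """
d = {}
for w in words:
    try: d[w] += 1
    except KeyError: d[w] = 1
"""

get_version = """
d = {}
for w in words:
    d[w] = d.get(w, 0) + 1
"""

for name, stmt in [('if/else', if_else), ('try/except', try_except),
                   ('dict.get', get_version)]:
    print name, timeit.Timer(stmt, setup).timeit(100)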

Bear hugs,
bearophile
 

Bengt Richter

    Bengt Richter enlightened us with:
        I suspect it's not possible to get '' in the list from somestring.split()

    Time to adjust your suspicions:

        ['', 'abc', '']

I know about that one ;-) I meant somestring.split() just like that -- without
a splitter argument. My suspicion remains ;-)

    does that beat the try and get versions? I.e., (untested)
        try: countDict[w] += 1
        except KeyError: countDict[w] = 1

    The OP's way is faster. A 'try' block is very fast, but throwing & handling
    an exception is slow.

Yes, so it's data sensitive. Same with the other one not mentioned:

    countDict.setdefault(w, [0])[0] += 1  # accum count in length-one list vs as int
                                          # too much subscripting work though

    countDict[w] = countDict.get(w, 0) + 1

    I think this has the potential of being faster than the OP's method,
    because it's likely to be fully implemented in C.

Not sure what you mean. Don't forget, all those expressions are evaluated
dynamically, step by step, with byte codes being switched on in C, but still
a bunch of steps. I.e.,
>>> import dis
>>> dis.dis(compile('countDict[w] = countDict.get(w, 0) + 1', '', 'exec'))
  1           0 LOAD_NAME                0 (countDict)
              3 LOAD_ATTR                1 (get)
              6 LOAD_NAME                2 (w)
              9 LOAD_CONST               0 (0)
             12 CALL_FUNCTION            2
             15 LOAD_CONST               1 (1)
             18 BINARY_ADD
             19 LOAD_NAME                0 (countDict)
             22 LOAD_NAME                2 (w)
             25 STORE_SUBSCR
             26 LOAD_CONST               2 (None)
             29 RETURN_VALUE

Looks like if you succeed most of the time without a KeyError, the try/except might be
a tad faster, depending on what the exception block setup/teardown costs.
>>> dis.dis(compile("""
... try: countDict[w] += 1
... except KeyError: countDict[w] = 1
... """, '', 'exec'))
  1           0 SETUP_EXCEPT            20 (to 23)
              3 LOAD_NAME                0 (countDict)
              6 LOAD_NAME                1 (w)
              9 DUP_TOPX                 2
             12 BINARY_SUBSCR
             13 LOAD_CONST               0 (1)
             16 INPLACE_ADD
             17 ROT_THREE
             18 STORE_SUBSCR
             19 POP_BLOCK
             20 JUMP_FORWARD            29 (to 52)

  2     >>   23 DUP_TOP
             24 LOAD_NAME                2 (KeyError)
             27 COMPARE_OP              10 (exception match)
             30 JUMP_IF_FALSE           17 (to 50)
             33 POP_TOP
             34 POP_TOP
             35 POP_TOP
             36 POP_TOP
             37 LOAD_CONST               0 (1)
             40 LOAD_NAME                0 (countDict)
             43 LOAD_NAME                1 (w)
             46 STORE_SUBSCR
             47 JUMP_FORWARD             2 (to 52)
        >>   50 POP_TOP
             51 END_FINALLY
        >>   52 LOAD_CONST               1 (None)
             55 RETURN_VALUE

Regards,
Bengt Richter
 

Sybren Stuvel

Bengt Richter enlightened us with:

    I meant somestring.split() just like that -- without a splitter argument.
    My suspicion remains ;-)

Mine too ;-)

Sybren
 

Ron Adam

Fredrik said:

    Lonnie Princehouse wrote:
        "[a-z0-9_]" means "match a single character from the set {a through z,
        0 through 9, underscore}".

    "\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
    specify otherwise using the locale or unicode flags), but is handled more
    efficiently by the RE engine.

    (you can use \w in sets too, so you can do e.g. "[\w@]")

    </F>

The \w does make a small difference, but not as much as I expected.
Surprisingly a standard Python word iterator works just as well, and is
easier to understand than the re version.

Which one is faster depends on the average word length and number of
ignored characters.

Cheers,
Ron




Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06820057 (best of 3)
count_words: 0.07333837 (best of 3)


#########

import string
import re
import time
import random


# Create a really ugly n length string to test with.
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
                  + '_@$&*()#/<>' + ' \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())


letters = string.lowercase + string.digits + '_@'

def word_iter(text, letters=letters):
    ltrs = letters.lower()
    wd = ''
    for c in text + ' ':
        if c in ltrs:
            wd += c
        elif wd != '':
            yield wd
            wd = ''

def word_counter(text):
    txt = text.lower()
    countDict = {}
    for wd in word_iter(txt):
        if wd in countDict:
            countDict[wd] += 1
        else:
            countDict[wd] = 1
    return countDict


word_finder = re.compile('[\w@]+', re.I)

def count_words(string, word_finder=word_finder):
    # avoid global lookups
    countDict = {}
    for match in word_finder.finditer(string.lower()):
        word = match.group(0)
        countDict[word] = countDict.get(word, 0) + 1
    return countDict


foos = [word_counter, count_words]
r1 = r2 = None
for foo in foos:
    best_time = 0
    for n in range(3):
        t = time.clock()
        for line in lines.splitlines():
            countDict = foo(line)
        tt = time.clock() - t
        if tt > best_time: best_time = tt

    r1 = r2
    r2 = countDict
    if r1 != None:
        # change to 1 if assert fails to find problem
        if 0:
            for k in r1.keys():
                if r1[k] != r2[k]:
                    print k, r1[k], r2[k]
        assert r1 == r2

    print '%s: %.8f (best of %d)' \
          % (foo.__name__, best_time, n+1)
 

Fredrik Lundh

Ron said:

    The \w does make a small difference, but not as much as I expected.

that's probably because your benchmark has a lot of dubious overhead:

    word_finder = re.compile('[\w@]+', re.I)

no need to force case-insensitive search here; \w looks for both lower-
and uppercase characters.

    for match in word_finder.finditer(string.lower()):

since you're using a case-insensitive RE, that lower() call is not necessary.

    word = match.group(0)

and findall() is of course faster than finditer() + m.group().

    t = time.clock()
    for line in lines.splitlines():
        countDict = foo(line)
    tt = time.clock()-t

and if you want performance, why are you creating a new dictionary for
each line in the sample?

here's a more optimized RE word finder:

word_finder_2 = re.compile('[\w@]+').findall

def count_words_2(string, word_finder=word_finder_2):
    # avoid global lookups
    countDict = {}
    for word in word_finder(string):
        countDict[word] = countDict.get(word, 0) + 1
    return countDict

with your original test on a slow machine, I get

count_words: 0.29868684 (best of 3)
count_words_2: 0.17244873 (best of 3)

if I call the function once, on the entire sample string, I get

count_words: 0.23096036 (best of 3)
count_words_2: 0.11690620 (best of 3)

</F>
 

Ron Adam

Fredrik said:

    that's probably because your benchmark has a lot of dubious overhead:

I think it does what the OP described, but that may not be what he
really needs.

Although the test to find the best of n was actually finding the worst
of n, which explains why I was getting a larger variance than I thought
I should have been getting.

    word_finder = re.compile('[\w@]+', re.I)

    no need to force case-insensitive search here; \w looks for both lower-
    and uppercase characters.

But the dictionary keys need to be either upper or lower, otherwise you
count 'The' separately from 'the'.

    since you're using a case-insensitive RE, that lower() call is not necessary.

    and findall() is of course faster than finditer() + m.group().

Cool, I don't use re that often so I just used what was posted to test
against.

    and if you want performance, why are you creating a new dictionary for
    each line in the sample?

Because that's what the OP apparently wanted: a line-by-line word count.
I originally did it to get an overall count and then changed it so it
matched the re version that was posted.
    here's a more optimized RE word finder:

        word_finder_2 = re.compile('[\w@]+').findall

        def count_words_2(string, word_finder=word_finder_2):
            # avoid global lookups
            countDict = {}
            for word in word_finder(string):
                countDict[word] = countDict.get(word, 0) + 1
            return countDict

    with your original test on a slow machine, I get

        count_words: 0.29868684 (best of 3)
        count_words_2: 0.17244873 (best of 3)

    if I call the function once, on the entire sample string, I get

        count_words: 0.23096036 (best of 3)
        count_words_2: 0.11690620 (best of 3)

    </F>

Wow, a lot bigger difference than on my machine. <curious> An Athlon 64
3000+ on WinXP; I'm not sure how much difference that would make.

This is what I get after adding the above version to it, with the lower().
There's not quite as big a difference as you get, but the findall version
is still faster than both the others.

Cheers,
Ron


Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06245989 (best of 3)
count_words: 0.07309812 (best of 3)
count_words_2: 0.04981024 (best of 3)


And counting all the words in one call on the whole string...

Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.05325006 (best of 3)
count_words: 0.05910528 (best of 3)
count_words_2: 0.03748158 (best of 3)


They all improve, but the re findall version is clearly better.



#####################

import string
import re
import time
import random

# Create a really ugly n length string to test with.
# The word length are
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
                  + '_@$&*()#/<>' + ' \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())


letters = string.lowercase + string.digits + '_@'

def word_iter(text, letters=letters):
    wd = ''
    for c in text + ' ':
        if c in letters:
            wd += c
        elif wd != '':
            yield wd
            wd = ''

def word_counter(text):
    countDict = {}
    for wd in word_iter(text.lower()):
        if wd in countDict:
            countDict[wd] += 1
        else:
            countDict[wd] = 1
    return countDict


word_finder = re.compile('[\w@]+', re.I).finditer

def count_words(string, word_finder=word_finder):
    # avoid global lookups
    countDict = {}
    for match in word_finder(string.lower()):
        word = match.group(0)
        countDict[word] = countDict.get(word, 0) + 1
    return countDict


word_finder_2 = re.compile('[\w@]+').findall

def count_words_2(string, word_finder=word_finder_2):
    # avoid global lookups
    countDict = {}
    for word in word_finder(string.lower()):
        countDict[word] = countDict.get(word, 0) + 1
    return countDict


foos = [word_counter, count_words, count_words_2]
r1 = r2 = None
for foo in foos:
    best_time = 1000000  # too large to be useful on purpose
    for n in range(3):
        t = time.clock()
        #for line in lines.splitlines():
        countDict = foo(lines)
        tt = time.clock() - t
        best_time = min(tt, best_time)

    r1 = r2
    r2 = countDict
    if r1 != None:
        # change to 1 if assert fails to find problem
        if 0:
            for k in r1.keys():
                if r1[k] != r2[k]:
                    print k, r1[k], r2[k]
        assert r1 == r2

    print '%s: %.8f (best of %d)' \
          % (foo.__name__, best_time, n+1)
 
