Crude statistics on the standard library


F. Petitjean

I have written a script to find the modules which export the largest
number of names. The gc.get_referrers(*objs) function also gives an idea
of the dependencies between the modules.

The code:

#!/usr/bin/env python
# -*- coding: latin-1 -*-

"""Rudimentary statistics on the names exported
by the modules of the standard library.
"""

import sys
import gc

from glob import glob
import os, os.path
from os.path import basename

def browse_stdlib():
    """Return the names of the top-level ``*.py`` modules of the standard library.

    The library directory is searched under ``sys.prefix``: by default the
    ``lib/pythonX.Y`` directory; when a compiled ``os.pyc`` is found under
    ``Lib`` that directory is used instead.  A fixed list of obsolete
    modules (and 'this') is removed from the result.

    Returns:
        list of module names, i.e. file base names without the ``.py``
        suffix.  The list is empty when no file matches.
    """
    # sys.version[:3] would yield "3.1" for Python 3.10; the first two
    # fields of sys.version_info are immune to two-digit minor versions.
    pyver = 'python%d.%d' % tuple(sys.version_info[:2])
    pyglob = os.path.join(sys.prefix, 'lib', pyver, '*.py')
    if os.path.exists(os.path.join(sys.prefix, 'Lib', 'os.pyc')):
        pyglob = os.path.join(sys.prefix, 'Lib', '*.py')
    names = [basename(path)[:-3] for path in glob(pyglob)]
    # remove some obsolete modules ('this' + DeprecationWarning)
    for dontparse in ("this", "tzparse", 'FCNTL', 'posixfile', 'pre', 'regsub',
                      'statcache', 'TERMIOS', 'xmllib'):
        try:
            names.remove(dontparse)
        except ValueError:
            # not shipped with this interpreter: nothing to remove
            pass
    return names

def exports(names, with_modules=False):
    """Import every module listed in *names* and count the names it exports.

    Each module name is written to standard output (without a newline) as
    it is processed; a module that cannot be imported is reported and
    skipped.

    Args:
        names: iterable of module names to import.
        with_modules: when true, the imported module objects are collected
            and returned as well.

    Returns:
        a 2-tuple ``(res, modules)``:
        - res: list of ``(nexports, ldir, name)`` tuples, one per imported
          module, in the order of *names*.  ``ldir`` is
          ``len(dir(module))``; ``nexports`` is ``len(module.__all__)``
          when the module defines ``__all__`` and ``ldir`` otherwise.
        - modules: list of the imported modules if *with_modules* is true,
          else an empty list.
    """
    res = []
    modules = []
    # this simple minded method (__import__) doesn't include sys ?
    for name in names:
        # progress indicator: keep all names on one output line
        sys.stdout.write("%s   " % (name,))
        try:
            module = __import__(name, globals(), locals(), [])
        except ImportError as msg:
            print("cannot import module %s %s" % (name, msg))
            continue
        ldir = len(dir(module))
        if hasattr(module, '__all__'):
            nexports = len(module.__all__)
        else:
            # no explicit public API: every name in dir() counts
            nexports = ldir
        res.append((nexports, ldir, name))
        if with_modules:
            modules.append(module)
    return res, modules

def pm_histo(values, nbins=20):
    """A poor man's histogram over an ascending sequence of integers.

    The value span ``values[0] .. values[-1]`` is cut into at most *nbins*
    intervals of equal integer width.  For each interval the function
    records the index range of the items falling into it.

    Args:
        values: non-empty sequence of numbers sorted in ascending order
            (the first and last items are taken as the extremes).
        nbins: maximum number of bins; reduced to the value span when the
            span is smaller.

    Returns:
        list of ``(left, right)`` index pairs; consecutive pairs share
        their boundary, and bin *k* covers ``values[left:right]``.  When no
        remaining item exceeds a bin's upper bound, the bin is closed at
        the last index, so the final item is not covered.  The list is
        empty when all values are equal.
    """
    lowest, upper = values[0], values[-1] + 1
    nbins = min(nbins, upper - lowest)
    width = int((upper - lowest) // nbins)
    assert width > 0
    ranges = []
    left = 0  # left index of the first bin
    bound = lowest + width
    while bound < upper:
        # look for the first item strictly above the current upper bound
        for right in range(left, len(values)):
            if values[right] > bound:
                break
        ranges.append((left, right))
        left = right
        bound = bound + width
    return ranges

def basic_stat(seq):
    """Compute basic statistics on the values in *seq* in a single pass.

    Args:
        seq: non-empty iterable of numbers; it may be an iterator without
            ``len()``.  Every item is converted with ``float()``.

    Returns:
        the 6-tuple ``(count, minimum, maximum, midrange, mean, stddev)``
        where ``midrange`` is ``(minimum + maximum) / 2`` (the value the
        calling script prints under the heading "median") and ``stddev``
        is the population standard deviation.

    Raises:
        ValueError: if *seq* contains no item.
    """
    import math

    count = 0
    total = 0.0
    total_sq = 0.0
    lowest = highest = None
    for item in seq:
        x = float(item)
        if count == 0:
            lowest = highest = x
        elif x < lowest:
            lowest = x
        elif x > highest:
            highest = x
        count = count + 1  # seq may be an iterable without len
        total = total + x
        total_sq = total_sq + x * x
    if not count:
        raise ValueError("basic_stat() needs at least one value")
    mean = total / count
    midrange = (lowest + highest) * 0.5
    # Rounding can push the difference slightly below zero for (nearly)
    # constant data; clamp so that sqrt() never sees a negative number.
    variance = max(0.0, (total_sq - count * mean * mean) / count)
    stddev = math.sqrt(variance)
    return count, lowest, highest, midrange, mean, stddev

if __name__ == '__main__':
    # Driver: import the modules, show who refers to each of them, then
    # print crude statistics on the number of exported names.
    names = ['cStringIO', 'sys', 'gc']
    freqs, modules = exports(names, True)
    # pm_histo(), the per-slice statistics and the final freqs[140:] listing
    # all read the counts as an ascending sequence.
    freqs.sort()
    print("")  # exports() prints without new line
    print("%d imported modules and %d in sys.modules" % (
        len(freqs), len(sys.modules)))

    print("number of unreachable objects %d" % (gc.collect(),))
    simples = []  # names of the modules with at most two referrers
    while modules:
        module = modules.pop()
        items = gc.get_referrers(module)
        if len(items) <= 2:
            # NOTE(review): the first two referrers are presumably
            # sys.modules and this script's own namespace - confirm.
            simples.append(module.__name__)
            del sys.modules[module.__name__], module, items
            continue
        print("referrers of %s" % (module.__name__,))
        for item in items[2:]:
            if not isinstance(item, dict):
                # item.get() below needs a dict; skip any other referrer
                continue
            name = item.get('__file__', 'unknown')
            # Reduce the file path to a short label: "package<sep>__init__"
            # for package initialisers, the bare module name otherwise.
            if name.endswith('__init__.pyc'):
                pslash = name.rfind(os.sep)
                pslash = name[:pslash].rfind(os.sep)
                name = name[pslash+1:][:-4]  # strip .pyc
            elif name.endswith('__init__.py'):
                pslash = name.rfind(os.sep)
                pslash = name[:pslash].rfind(os.sep)
                name = name[pslash+1:][:-3]  # strip .py
            elif name.endswith('.pyc'):
                pslash = name.rfind(os.sep)
                name = name[pslash+1:][:-4]  # strip .pyc
            elif name.endswith('.py'):
                pslash = name.rfind(os.sep)
                name = name[pslash+1:][:-3]  # strip .py
            sys.stdout.write("%s " % (name,))
        print("")  # end the line of referrer names
        del module, items

    print("number of unreachable objects %d" % (gc.collect(),))
    print("new length of sys.modules %d" % (len(sys.modules),))
    print("%d simple modules" % (len(simples),))
    values = [item[0] for item in freqs]
    ranges = pm_histo(values)
    # ranges2 and limite are not used below; NOTE(review): presumably kept
    # for inspection at the interactive prompt (python -i) - confirm.
    ranges2 = [item for item in ranges if item[1] > item[0]]
    limite = ranges[0][1] + 1  # first bin
    rangesbas = pm_histo(values[:95], 6)
    print(rangesbas)
    lbin = 11  # number of modules summarised per output row
    print("St Nb. min max median average stddev")
    fmt = "%3d%3d%6.1f%6.1f%8.3f%8.3f%8.3f"
    for start in range(0, len(values), lbin):
        res = (start,) + basic_stat(values[start:start+lbin])
        print(fmt % res)

    print("modules with a lot of external names :")
    for item in freqs[140:]:
        print(item)

Parts of the output of python -i (Python 2.4, Windows):
repr rexec rfc822 rlcompleter cannot import module rlcompleter
No module named readline robotparser sched
.... etc ...
whrandom C:\Python24\lib\ DeprecationWarning: the
whrandom module is deprecated; please use the random module
xdrlib xmlrpclib zipfile
.... etc ...
_threading_local __future__ Hello world...
cannot import module No module named foo

cannot import module No module named foo
number of unreachable objects 0
referrers of __future__
referrers of __future__
.... etc ...
referrers of socket
asynchat asyncore BaseHTTPServer SocketServer urllib httplib ftplib
imaplib nntplib poplib smtpd smtplib Utils
.... etc ...
referrers of cStringIO
logging\__init__ xmlrpclib
number of unreachable objects 564
new length of sys.modules 154
121 simple modules
[(0, 39), (39, 58), (58, 74), (74, 79), (79, 91), (91, 94)]
St Nb. min max median average stddev
0 11 1.0 1.0 1.000 1.000 0.000
11 11 1.0 2.0 1.500 1.545 0.498
22 11 2.0 2.0 2.000 2.000 0.000
33 11 2.0 3.0 2.500 2.455 0.498
44 11 3.0 3.0 3.000 3.000 0.000
55 11 3.0 4.0 3.500 3.727 0.445
66 11 4.0 5.0 4.500 4.273 0.445
77 11 5.0 6.0 5.500 5.818 0.386
88 11 6.0 8.0 7.000 6.909 0.668
99 11 9.0 10.0 9.500 9.545 0.498
110 11 10.0 12.0 11.000 11.182 0.716
121 11 12.0 16.0 14.000 13.818 1.113
132 11 16.0 21.0 18.500 17.636 1.367
143 11 21.0 29.0 25.000 24.818 2.367
154 11 31.0 51.0 41.000 38.364 7.413
165 11 55.0 92.0 73.500 70.636 10.764
176 5 97.0 136.0 116.500 111.400 13.865
modules with a lot of external names :
(18, 40, 'cgi')
(19, 19, 'cgitb')
.... etc ...
(72, 72, 'pydoc')
(74, 74, 'cookielib')
(78, 78, 'urllib2')
(86, 86, 'symbol')
(92, 92, 'sre_constants')
(97, 97, 'xmlrpclib')
(101, 118, 'os')
(107, 107, 'sre_compile')
(116, 116, 'sre_parse')
(136, 151, 'socket')

Output with python 2.3.3 Linux gives a greater number for socket as the
OpenSSL library is wrapped.
gc.collect() at the interactive prompt gives 0. (good)

Conclusion :
sre_compile and sre_parse should be coded with a __all__ attribute
The standard library contains a module 'tzparse' which cannot be imported !
Most library modules do not begin with #!/usr/bin/env python and a
coding cookie.



F. Petitjean wrote:
> Conclusion :
> sre_compile and sre_parse should be coded with a __all__ attribute

Problem with this is that it would change the API for the two modules.
And the main reason for the dependencies is that sre_constants is
import-star'ed; same with sre_constants.

But yes, it wouldn't hurt to lower them. But then again people are not
supposed to be using these modules directly; they are there to provide
support to the re module.
> The standard library contains a module 'tzparse' which cannot be
> imported !

It can, but you must have the 'TZ' environment variable set. It's
deprecated and has been moved to lib-old as of Python 2.5.
> Most library modules do not begin with #!/usr/bin/env python and a
> coding cookie.

Not all modules are meant to be run as a script. Plus, with the advent
of the '-m' argument for the interpreter it really isn't necessary.
And as for the encoding cookie, most modules have been around much
longer than that feature so they are almost all ASCII encoded.


Fredrik Lundh

F. Petitjean said:
> sre_compile and sre_parse should be coded with a __all__ attribute

they're implementation modules, and shouldn't be used by user code.


