code debugging

golu

Here is some code that crawls links sent to it. There's a problem with the
retrieve_url function; please help me debug it. The function retrieves pages
and saves them to files:
#TODO: The visited dict grows in size; it needs to be handled smartly.
#Moreover, the server program needs to be in sync with the client, e.g. Myrobot.
#Take care of the 'If-Modified-Since' header, repeated links, hash links.
#This is the client side of the distributed crawling framework.
#It gets the list of urls to be crawled,
#then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter.
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading

from socket import *
PAGE_DIR="C:/users/jayesh/
pages/" # directory where the
web pages are stored temporarily

# before transfer to the grey_matter
visited=
{} # a
dict to remember visited urls
ROBOT_COUNT=4


def fget():
    """ This function retrieves the zipped file containing the
    list of urls from the grey_matter and saves them in a local
    file 'list.txt'. """

    httplib.HTTPConnection.debuglevel = 1
    request = urllib2.Request('http://192.168.153.57/list.zip')
    request.add_header('Accept-encoding', 'gzip')   # requesting the zipped file
                                                    # containing the list of urls
    opener = urllib2.build_opener()
    flag = 1
    s = 'Waiting for server'
    while flag == 1:
        try:
            op = opener.open(request)
            flag = 0
        except:
            s = s + '*'
            print s
    f = open('list.zip', "wb")
    f.write(op.read())
    f.close()
    z = zipfile.ZipFile('list.zip')
    p = z.namelist()
    g = open('list.txt', "wb")
    g.write(z.read(p[0]))
    g.close()
    print 'got zipped file'

def compress():
    """ This function compresses the crawled pages and stores them in
    a single compressed file ready to be sent to the grey_matter."""

    zfile = zipfile.ZipFile('C:/xampp/htdocs/pages.zip', mode='w')
    for fil in os.listdir(PAGE_DIR):
        full = os.path.join(PAGE_DIR, fil)
        zfile.write(full, fil)
        os.remove(full)
    zfile.close()        # bug fix: close the archive so the zip index is written
    os.rmdir(PAGE_DIR)   # removing the directory after transfer to grey_matter

x = 0

class robot(threading.Thread):
    """ The main robot class which does the crawling of the listed
    urls it receives from the grey_matter. It uses ROBOT_COUNT
    threads which crawl the listed urls concurrently."""

    def __init__(self, urllist, urllistlock, dblock):
        threading.Thread.__init__(self)
        self.urllist = urllist
        self.urllistlock = urllistlock
        self.dblock = dblock

    def popurl(self):
        """ This method pops urls off the shared url list one by one
        and hands them over for retrieval."""

        self.urllistlock.acquire(1)
        if len(self.urllist) < 1:
            Nexturl = None
        else:
            Nexturl = self.urllist[0]
            if Nexturl[-1] == '\n':
                Nexturl = Nexturl[:-1]
            del self.urllist[0]
        self.urllistlock.release()
        return Nexturl

    def retrieve_url(self, url):
        """ The main method of the robot class, called by the run
        method, which retrieves the given url from the web."""
        global x
        if url is None:
            return
        try:
            if visited.has_key(url):
                return
            pieces = urlparse.urlparse(url)
            filepath = pieces[2]
            if filepath != '':
                filepath = filepath[1:]
                filename = filepath.split("/")[-1]
            else:
                filename = str(x) + '.htm'   # bug fix: x is an int and must be
                x += 1                       # converted before concatenating
            if filename == '':               # bug fix: a path ending in '/' gave an
                filename = str(x) + '.htm'   # empty filename, so open() was handed
                x += 1                       # the bare directory and failed
            path = os.path.join(PAGE_DIR, filename)
            url = urlparse.urlunparse(pieces)
            p = url.rfind('#')   # temporary: drop hash links
            if p != -1:
                url = url[:p]

            visited[url] = 1   # bug fix: 'visited=1' rebound the name to an int
                               # instead of recording the url in the dict
            m = urllib2.urlopen(url)

            fopen = open(path, 'wb')
            fopen.write(url + '|')
            fopen.write(m.read())
            fopen.close()
            print url, 'retrieved'

        except IOError:
            print url
            print "ERROR: OOPS! THE URL CAN'T BE RETRIEVED"

        return

    def run(self):
        while 1:
            url = self.popurl()
            if url is None:
                break
            try:
                self.retrieve_url(url)
            except:
                sys.exit()

if __name__ == '__main__':

    s = socket(AF_INET, SOCK_STREAM)
    s.bind(('', 444))
    s.listen(5)
    q, v = s.accept()
    count = 1
    print 'Connecting...'
    while 1:
        print 'Phase: %s' % (count)
        message = q.recv(3)

        if message != 'yes':
            continue
        print 'Connected'
        count = count + 1
        fget()   # calling fget to get the url list from the grey_matter (server)
        try:
            os.mkdir(PAGE_DIR)
        except:
            print 'Cant make dir'
        try:
            f = open('list.txt', 'r')
            urllist = f.readlines()
            f.close()
        except:
            print 'Error opening urls file'
            sys.exit()
        print 'starting threads'
        urllistlock = threading.Lock()
        dblock = threading.Lock()
        botlist = []
        for X in range(0, ROBOT_COUNT):
            newbot = robot(urllist, urllistlock, dblock)
            newbot.setName(str(X))   # bug fix: was the literal 'X', which named
            botlist.append(newbot)   # every thread identically
            newbot.start()

        for X in range(0, ROBOT_COUNT):
            botlist[X].join()

        compress()
        try:
            q.send('yes')
        except:
            print 'socket disconnected'
            sys.exit()
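Re the TODO at the top (the visited dict grows without bound): one possible
approach would be to cap it and evict the oldest entries. A minimal sketch,
assuming a deque alongside the dict; MAX_VISITED and mark_visited are made-up
names, not part of the program above:

from collections import deque

MAX_VISITED = 100000     # assumed limit, tune to available memory
visited = {}
visit_order = deque()    # remembers insertion order for eviction

def mark_visited(url):
    """Record a url, forgetting the oldest one once the cap is hit."""
    if url in visited:
        return
    visited[url] = 1
    visit_order.append(url)
    if len(visit_order) > MAX_VISITED:
        old = visit_order.popleft()   # evict the oldest url
        del visited[old]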
 
Chris Rebert

Here is some code that crawls links sent to it. There's a problem with
the retrieve_url function; please help me debug it. The function
retrieves pages and saves them to files.

Please specify exactly what the problem is that you are experiencing.
If you are getting an error, please provide the error message and full
traceback.

Cheers,
Chris
 

golu

Please specify exactly what the problem is that you are experiencing.
If you are getting an error, please provide the error message and full
traceback.

Cheers,
Chris
-- http://blog.rebertia.com

I want to save pages in a directory, and I'm using the urls to build
the filenames. The program gets stuck at the saving step. Can you
suggest a way to save a page, e.g. google.com, as a file google.html?
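For example, here is a minimal sketch of what I think goes wrong (assuming
Windows): for a url like http://google.com/ the path-derived filename comes
out empty, so open() is handed the bare directory:

import os
import urlparse

url = 'http://google.com/'
filepath = urlparse.urlparse(url)[2]     # '/'
filename = filepath[1:].split('/')[-1]   # '' -- nothing after the slash
path = os.path.join('C:/users/jayesh/pages/', filename)
print repr(path)                         # just the directory itself
f = open(path, 'wb')                     # raises IOError on Windows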
 

Gabriel Genellina

I want to save pages in a directory, and I'm using the urls to build
the filenames. The program gets stuck at the saving step. Can you
suggest a way to save a page, e.g. google.com, as a file google.html?

You may use str.translate to replace/remove all undesired characters:

py> import string
py> valid = string.ascii_letters+string.digits+'.'
py> invalid = ''.join(chr(x) for x in range(256) if chr(x) not in valid)
py> table = string.maketrans(invalid, '_'*len(invalid))
py> x = 'http://docs.python.org/library/string.html'
py> x.translate(table)
'http___docs.python.org_library_string.html'

See http://docs.python.org/library/stdtypes.html#str.translate
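Building on that table, the filename logic in retrieve_url could go through a
helper along these lines. This is just a sketch: url_to_filename, the host-name
fallback, and the '.htm' suffix rule are my own additions, not from the
original code (the suffix mirrors the '.htm' fallback retrieve_url already
uses):

import string
import urlparse

valid = string.ascii_letters + string.digits + '.'
invalid = ''.join(chr(x) for x in range(256) if chr(x) not in valid)
table = string.maketrans(invalid, '_' * len(invalid))

def url_to_filename(url):
    """Turn a url into a safe filename, e.g. google.com -> google.com.htm."""
    name = urlparse.urlparse(url)[2][1:].split('/')[-1]
    if name == '':
        name = urlparse.urlparse(url)[1]   # fall back to the host name
    name = name.translate(table)           # replace undesired characters
    if not name.endswith(('.htm', '.html')):
        name += '.htm'
    return name

print url_to_filename('http://google.com/')        # google.com.htm
print url_to_filename('http://docs.python.org/library/string.html')
                                                   # string.html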
 
