A
abhinav
//A CRAWLER IMPLEMENTATION
please run this prog. on the shell and under the control of debugger
when this prog. is run normally the prog. does not terminate .It
doesn't come out of the cond. if c<5: so this prog. continues
infinitely
but if this prog is run under the control of debugger the prog
terminates when the cond. if c<5: becomes false
i think this prob. may be due to multithreading pls help.
from sgmllib import SGMLParser
import threading
import re
import urllib
import pdb
import time
class urlist(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.list=[]
def start_a(self,attr):
href=[v for k,v in attr if k=="href"]
if href:
self.list.extend(href)
mid=2
c=0
class mythread(threading.Thread):
stdmutex=threading.Lock()
global threads
threads=[]
def __init__(self,u,myid):
self.u=u
self.myid=myid
threading.Thread.__init__(self)
def run(self):
global c
global mid
if c<5:
self.stdmutex.acquire()
self.usock=urllib.urlopen(self.u)
self.p=urlist()
self.s=self.usock.read()
self.p.feed(self.s)
self.usock.close()
self.p.close()
c=c+1
fname="/root/" + str(c) + ".txt"
self.f=open(fname,"w")
self.f.write(self.s)
self.f.close()
print c
print self.p.list
print self.u
print self.myid
for j in self.p.list:
k=re.search("^https?:",j)
if k:
i=mythread(j,mid)
i.start()
threads.append(i)
mid=mid+1
self.stdmutex.release()
if __name__=="__main__":
thread=mythread("http://www.google.co.in/",1)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
print "main thread exits"
please run this prog. on the shell and under the control of debugger
when this prog. is run normally the prog. does not terminate .It
doesn't come out of the cond. if c<5: so this prog. continues
infinitely
but if this prog is run under the control of debugger the prog
terminates when the cond. if c<5: becomes false
i think this prob. may be due to multithreading pls help.
from sgmllib import SGMLParser
import threading
import re
import urllib
import pdb
import time
class urlist(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.list=[]
def start_a(self,attr):
href=[v for k,v in attr if k=="href"]
if href:
self.list.extend(href)
mid=2
c=0
class mythread(threading.Thread):
stdmutex=threading.Lock()
global threads
threads=[]
def __init__(self,u,myid):
self.u=u
self.myid=myid
threading.Thread.__init__(self)
def run(self):
global c
global mid
if c<5:
self.stdmutex.acquire()
self.usock=urllib.urlopen(self.u)
self.p=urlist()
self.s=self.usock.read()
self.p.feed(self.s)
self.usock.close()
self.p.close()
c=c+1
fname="/root/" + str(c) + ".txt"
self.f=open(fname,"w")
self.f.write(self.s)
self.f.close()
print c
print self.p.list
print self.u
print self.myid
for j in self.p.list:
k=re.search("^https?:",j)
if k:
i=mythread(j,mid)
i.start()
threads.append(i)
mid=mid+1
self.stdmutex.release()
if __name__=="__main__":
thread=mythread("http://www.google.co.in/",1)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
print "main thread exits"