Help on thread pool

Alex

Hi all.

In order to understand the concept of a thread pool in Python, I'm
working on a simple single-site web crawler.
I would like to stop the program when the thread pool has
downloaded all internal links from a web site, but right now my
program keeps waiting forever, even when there are no more links to
download.

Here's my code; I'd appreciate any comments. I'm programming just for
fun and learning ;-)

Thanks in advance.

from BeautifulSoup import BeautifulSoup
import urllib
from pprint import pprint
import string
from urlparse import urlparse
import sys
from threading import Thread
import time
from Queue import Queue

#dirty hack: set default encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')

opener = urllib.FancyURLopener({})

class Crawler:

    def __init__(self):
        """
        Constructor
        """
        self.missed = 0
        self.url_list = []
        self.urls_queue = Queue()
        self.num_threads = 5

        self._create_threads()

    def get_internal_links(self, url):
        """
        Get all internal links from a web page and feed the queue
        """
        self.url = url
        url_netloc = urlparse(self.url).netloc
        print "Downloading... ", self.url
        time.sleep(5)
        try:
            p = opener.open(self.url)
            #print p.info()
        except IOError:
            print "error connecting to ", self.url
            print "wait..."
            time.sleep(5)
            print "retry..."
            try:
                p = urllib.urlopen(self.url)
            except IOError:
                self.missed = self.missed + 1
                return None

        html = p.read()
        soup = BeautifulSoup(html)
        anchors = soup.findAll('a')
        links = [str(anchor['href']) for anchor in anchors]
        internal_links = [link for link in links
                          if urlparse(link).netloc == url_netloc]

        for link in internal_links:
            if link not in self.url_list and link != self.url:
                self.url_list.append(link)
                self.urls_queue.put(link)
                print "Queue size: ", self.urls_queue.qsize()
                print "List size: ", str(len(self.url_list))
                print "Errors: ", str(self.missed)
        self._queue_consumer()

    def _queue_consumer(self):
        """
        Consume the queue
        """
        while True:
            url = self.urls_queue.get()
            print 'Next url: ', url
            self.get_internal_links(url)
            self.urls_queue.task_done()

    def _create_threads(self):
        """
        Set up some threads to fetch pages
        """
        for i in range(self.num_threads):
            worker = Thread(target=self._queue_consumer, args=())
            worker.setDaemon(True)
            worker.start()

#-----------------------------------------------------------------------------
#

if __name__ == '__main__':

    c = Crawler()
    c.get_internal_links('http://www.thinkpragmatic.net/')
 
Jeff

Your worker threads wait around forever because there is no place for
them to exit. Queue.get() blocks by default until an item is
available in the queue. You can do something like the following to
make a worker quit when the queue is empty. Just make sure that you
fill the queue before starting the worker threads.

from Queue import Queue, Empty

# in your worker
while True:
    try:
        item = q.get(block=False)
    except Empty:
        break
    do_something_with_item()
    q.task_done()
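
Applied to the crawler above, the change might look something like
this (an untested sketch: only _queue_consumer changes, it relies on
the Empty import shown above, and it assumes the first batch of links
is already on the queue before _create_threads() starts the workers):

    # Sketch: Alex's _queue_consumer rewritten with the non-blocking get.
    # A worker that finds the queue empty exits instead of blocking.
    def _queue_consumer(self):
        while True:
            try:
                url = self.urls_queue.get(block=False)
            except Empty:
                break  # nothing left to do: let this thread finish
            print 'Next url: ', url
            self.get_internal_links(url)
            self.urls_queue.task_done()

One caveat for a crawler: a worker can see a momentarily empty queue
while another worker is still downloading a page that will add more
links, and it will then exit early. That is why filling the queue
before starting the threads matters with this approach.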

You can also use a condition variable and a lock or a semaphore to
signal the worker threads that all work has completed.
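
For example, here is a minimal sketch of that signalling idea, using
a Condition that guards an "all work submitted" flag. All the names
here (worker, producer, handle, and so on) are illustrative, not from
Alex's code:

from Queue import Queue, Empty
from threading import Thread, Condition

queue = Queue()
finished = Condition()    # guards the all_submitted flag
all_submitted = [False]   # one-element list so workers see updates

def worker():
    while True:
        try:
            item = queue.get(block=False)
        except Empty:
            finished.acquire()
            try:
                if all_submitted[0] and queue.empty():
                    return             # drained and no more coming
                finished.wait(1.0)     # sleep until the producer signals
            finally:
                finished.release()
            continue
        handle(item)                   # hypothetical per-item work
        queue.task_done()

def producer(items):
    for item in items:
        queue.put(item)
    finished.acquire()
    all_submitted[0] = True
    finished.notifyAll()               # wake any waiting workers
    finished.release()

The timeout on wait() is a safety net: even if a worker misses the
notification, it wakes up within a second and re-checks the flag.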
 
Alex


Thanks a lot, it works!


Alex
 
