I
i80and
I'm working on a basic web spider, and I'm having problems with the
urlparser.
This is the effected function:
------------------------------
def FindLinks(Website):
WebsiteLen = len(Website)+1
CurrentLink = ''
i = 0
SpliceStart = 0
SpliceEnd = 0
LinksString = ""
LinkQueue = open('C:/LinkQueue.txt', 'a')
while (i < WebsiteLen) and (i != -1):
#Debugging info
#print '-----'
#print 'Length = ' + str(WebsiteLen)
#print 'SpliceStart = ' + str(SpliceStart)
#print 'SpliceEnd = ' + str(SpliceEnd)
#print 'i = ' + str(i)
SpliceStart = Website.find('<a href="', (i+1))
SpliceEnd = (Website.find('">', SpliceStart))
ParsedURL =
urlparse((Website[SpliceStart+9SpliceEnd+1)]))
robotparser.set_url(ParsedURL.hostname + '/' +
'robots.txt')
robotparser.read()
if (robotparser.can_fetch("*",
(Website[SpliceStart+9SpliceEnd+1)])) == False):
i = i - 1
else:
LinksString = LinksString + "\n" +
(Website[SpliceStart+9SpliceEnd+1)])
LinksString = LinksString[len(LinksString) - 1)]
#print 'found ' + LinksString
i = SpliceEnd
LinkQueue.write(LinksString)
LinkQueue.close()
------------------------------
Sorry if it's uncommented. When I run my program, I get this error:
-----
Traceback (most recent call last):
File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py",
line 120, in <module>
FindLinks(Website)
File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py",
line 84, in FindLinks
robotparser.read()
File "C:\Program Files\Python25\lib\robotparser.py", line 61, in read
f = opener.open(self.url)
File "C:\Program Files\Python25\lib\urllib.py", line 190, in open
return getattr(self, name)(url)
File "C:\Program Files\Python25\lib\urllib.py", line 451, in
open_file
return self.open_local_file(url)
File "C:\Program Files\Python25\lib\urllib.py", line 465, in
open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified:
'en.wikipedia.org\\robots.txt'
Note the last line 'en.wikipedia.org\\robots.txt'. I want
'en.wikipedia.org/robots.txt'! What am I doing wrong?
If this has been answered before, please just give me a link to the
proper thread. If you need more contextual code, I can post more.
urlparser.
This is the effected function:
------------------------------
def FindLinks(Website):
WebsiteLen = len(Website)+1
CurrentLink = ''
i = 0
SpliceStart = 0
SpliceEnd = 0
LinksString = ""
LinkQueue = open('C:/LinkQueue.txt', 'a')
while (i < WebsiteLen) and (i != -1):
#Debugging info
#print '-----'
#print 'Length = ' + str(WebsiteLen)
#print 'SpliceStart = ' + str(SpliceStart)
#print 'SpliceEnd = ' + str(SpliceEnd)
#print 'i = ' + str(i)
SpliceStart = Website.find('<a href="', (i+1))
SpliceEnd = (Website.find('">', SpliceStart))
ParsedURL =
urlparse((Website[SpliceStart+9SpliceEnd+1)]))
robotparser.set_url(ParsedURL.hostname + '/' +
'robots.txt')
robotparser.read()
if (robotparser.can_fetch("*",
(Website[SpliceStart+9SpliceEnd+1)])) == False):
i = i - 1
else:
LinksString = LinksString + "\n" +
(Website[SpliceStart+9SpliceEnd+1)])
LinksString = LinksString[len(LinksString) - 1)]
#print 'found ' + LinksString
i = SpliceEnd
LinkQueue.write(LinksString)
LinkQueue.close()
------------------------------
Sorry if it's uncommented. When I run my program, I get this error:
-----
Traceback (most recent call last):
File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py",
line 120, in <module>
FindLinks(Website)
File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py",
line 84, in FindLinks
robotparser.read()
File "C:\Program Files\Python25\lib\robotparser.py", line 61, in read
f = opener.open(self.url)
File "C:\Program Files\Python25\lib\urllib.py", line 190, in open
return getattr(self, name)(url)
File "C:\Program Files\Python25\lib\urllib.py", line 451, in
open_file
return self.open_local_file(url)
File "C:\Program Files\Python25\lib\urllib.py", line 465, in
open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified:
'en.wikipedia.org\\robots.txt'
Note the last line 'en.wikipedia.org\\robots.txt'. I want
'en.wikipedia.org/robots.txt'! What am I doing wrong?
If this has been answered before, please just give me a link to the
proper thread. If you need more contextual code, I can post more.