import java.net.*;
import java.io.*;
import java.util.regex.*;
import java.util.*;
/**
 * Small recursive link crawler.
 *
 * Starting from a hard-coded root page, it downloads each page, extracts every
 * {@code href="..."} attribute, turns the value into an absolute URL with
 * {@link #fullURL(String, String)}, prints each URL the first time it is seen and
 * then descends into it.
 *
 * Relative links are resolved by plain concatenation onto the host root
 * ("http://host/"), not against the directory of the page they were found on.
 * The crawl is not limited to the starting host and has no depth limit.
 *
 * The class deliberately sticks to the raw (pre-generics) collection types used by
 * its existing public signatures.
 */
public class PullUrl3
{
    /** When true, fullURL prints a short tag naming the branch it took. */
    final static boolean DEBUG=false;

    /** Every URL reported so far (used as a set: key and value are the same string). */
    static Hashtable urls = new Hashtable();

    /** Scheme and host of an absolute http(s) URL, without the trailing slash. */
    private static final Pattern HOST_ROOT =
        Pattern.compile("^https?://[^/]+", Pattern.CASE_INSENSITIVE);

    /** A double-quoted href attribute; attribute names are case-insensitive in HTML. */
    private static final Pattern HREF =
        Pattern.compile("href=\".*?\"", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** Leading "/~name/" virtual-directory prefix. */
    private static final Pattern ROOTED_VIRTUAL_DIR = Pattern.compile("/~.*?/", Pattern.DOTALL);

    /** "~name/" virtual-directory prefix. */
    private static final Pattern VIRTUAL_DIR = Pattern.compile("~.*?/", Pattern.DOTALL);

    /** A URI scheme prefix such as "mailto:" or "ftp:". */
    private static final Pattern SCHEME = Pattern.compile("^[A-Za-z][A-Za-z0-9+.\\-]*:");

    /**
     * Crawls from the hard-coded root page and prints "Done" when finished.
     *
     * @param args ignored
     */
    public static void main(String [] args)
    {
        String rootString = "http://etext.lib.virginia.edu/koran.html";
        // Resolve links against the host root, not against the page URL itself;
        // using the page URL as the base would yield ".../koran.html/<link>".
        String rootBase = hostRoot(rootString);
        if (rootBase == null)
        {
            rootBase = rootString;
        }
        ArrayList baseListing = getLinks(rootBase, rootString);
        if (!baseListing.isEmpty())
        {
            Driller(rootBase, baseListing);
        }
        System.out.println("Done");
    }

    /**
     * Visits every URL in the listing, and recursively every new URL found on those pages.
     *
     * @param thebase    base of the page the listing came from (not used by the traversal)
     * @param urlListing list of absolute URL strings; entries that are not http(s) URLs
     *                   are skipped
     */
    public static void Driller(String thebase, ArrayList urlListing)
    {
        if (urlListing == null)
        {
            return;
        }
        for (Iterator c = urlListing.iterator(); c.hasNext(); )
        {
            String singleURL = (String) c.next();
            String newBaseString = hostRoot(singleURL);
            if (newBaseString == null)
            {
                continue; // not an absolute http(s) URL, nothing to fetch
            }
            ArrayList newBase = getLinks(newBaseString, singleURL);
            if (!newBase.isEmpty())
            {
                // The page produced links that were not seen before: descend into them.
                Driller(newBaseString, newBase);
            }
        }
    }

    /**
     * Downloads a page and returns the links on it that have not been seen before.
     *
     * Each new link is printed to standard output and recorded in {@link #urls}.
     * Links containing '#' and links that fullURL maps to the empty string are ignored.
     * An I/O failure is reported on standard output and yields an empty list.
     *
     * @param baseString base ("http://host/") that relative links are appended to
     * @param theurl     address of the page to download
     * @return list of absolute URL strings, possibly empty, never null
     */
    public static ArrayList getLinks(String baseString, String theurl)
    {
        ArrayList returnThis = new ArrayList();
        try
        {
            String page = fetch(theurl);
            Matcher matcher = HREF.matcher(page);
            while (matcher.find())
            {
                String fullUrl = fullURL(baseString, removeHref(matcher.group()));
                // Skip anchors and anything fullURL declined to resolve.
                if (fullUrl.length() == 0 || fullUrl.indexOf('#') != -1)
                {
                    continue;
                }
                // put() returns null only the first time a key is stored.
                if (urls.put(fullUrl, fullUrl) == null)
                {
                    System.out.println(fullUrl);
                    returnThis.add(fullUrl);
                }
            }
        }
        catch (IOException e)
        {
            System.out.println("Error : "+e);
        }
        return returnThis;
    }

    /**
     * Converts an href value into an absolute URL string.
     *
     * Rules, in order:
     *   - null, empty, or starting with '#': returns "".
     *   - starting with http:// or https://: returned unchanged.
     *   - any other scheme (mailto:, ftp:, ...): returns "".
     *   - starting with "/~": the "/~name/" prefix is removed, the rest appended to the base.
     *   - starting with "/": the leading slash is removed, the rest appended to the base.
     *   - starting with "~": the "~name/" prefix is removed, the rest appended to the base.
     *   - anything else: appended to the base.
     *
     * @param baseString base URL; a trailing '/' is added when missing
     * @param value      raw href value; surrounding whitespace is trimmed
     * @return the absolute URL, or "" when the value should not be followed
     */
    public static String fullURL(String baseString, String value)
    {
        if (value == null)
        {
            return "";
        }
        String base = (baseString == null) ? "" : baseString;
        if (!base.endsWith("/"))
        {
            base = base + "/";
        }
        value = value.trim();
        if (value.length() == 0)
        {
            return "";
        }
        if (value.charAt(0) == '#')
        {
            debug("#");
            return "";
        }
        if (isAbsoluteHttp(value))
        {
            debug("http");
            return value;
        }
        if (SCHEME.matcher(value).find())
        {
            // mailto:, ftp:, javascript: ... cannot be crawled over HTTP.
            debug("scheme");
            return "";
        }
        if (value.startsWith("/~"))
        {
            debug("/1");
            String stripped = ROOTED_VIRTUAL_DIR.matcher(value).replaceFirst("");
            // Without a closing slash nothing was removed; avoid a double slash.
            if (stripped.startsWith("/"))
            {
                stripped = stripped.substring(1);
            }
            return base + stripped;
        }
        if (value.charAt(0) == '/')
        {
            debug(value.endsWith("/") ? "/2" : "/3");
            return base + value.substring(1);
        }
        if (value.charAt(0) == '~')
        {
            debug("~");
            return base + VIRTUAL_DIR.matcher(value).replaceFirst("");
        }
        debug(value.endsWith("/") ? "~def1" : "def2");
        return base + value;
    }

    /**
     * Extracts the quoted value from an {@code href="..."} match.
     *
     * @param value text of the form {@code href="VALUE"}
     * @return VALUE, or "" when the text is too short to contain a quoted value
     */
    public static String removeHref(String value)
    {
        // 6 == "href=\"".length(); one more character is needed for the closing quote.
        if (value == null || value.length() < 7)
        {
            return "";
        }
        return value.substring(6,value.length() - 1 );
    }

    /** Returns "scheme://host/" for an absolute http(s) URL, or null for anything else. */
    private static String hostRoot(String url)
    {
        if (url == null)
        {
            return null;
        }
        Matcher matcher = HOST_ROOT.matcher(url);
        return matcher.find() ? matcher.group() + "/" : null;
    }

    /** Tells whether the value starts with http:// or https:// (case-insensitive). */
    private static boolean isAbsoluteHttp(String value)
    {
        return value.regionMatches(true, 0, "http://", 0, 7)
            || value.regionMatches(true, 0, "https://", 0, 8);
    }

    /** Prints the branch tag followed by a newline when DEBUG is enabled. */
    private static void debug(String tag)
    {
        if (DEBUG)
        {
            System.out.print(tag + "\n");
        }
    }

    /** Downloads the body at the address with an HTTP GET, mapping each byte to one char. */
    private static String fetch(String address) throws IOException
    {
        URLConnection connection = new URL(address).openConnection();
        if (!(connection instanceof HttpURLConnection))
        {
            throw new IOException("Not an HTTP URL: " + address);
        }
        HttpURLConnection huc = (HttpURLConnection) connection;
        StringBuffer content = new StringBuffer();
        InputStream in = null;
        try
        {
            huc.setRequestMethod("GET");
            huc.setDoInput(true);
            huc.setDoOutput(false);
            huc.setUseCaches(false);
            huc.connect();
            in = new BufferedInputStream(huc.getInputStream());
            int cint;
            while ((cint = in.read()) != -1)
            {
                content.append((char) cint);
            }
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // Closing is best effort; the content has already been read or has failed.
                }
            }
            huc.disconnect();
        }
        return content.toString();
    }
}