Can anyone give me an example of screen scraping a website and returning
the result (e.g. the HTML) as a string?
Here is a rough and ready screen scraper I wrote to pluck exchange
rates off the Oanda site. They sicced the legal people on me, so I had
to quickly desist using it. However, you could use it as a model.
This is not beautiful code. It is just to show proof of concept. It
uses unnecessarily low-level socket code. It was easier at the time
than trying to decipher how Sun's classes worked.
I offer this as is.
/* Oanda complained, so I had to discontinue this.
* Requests a currency conversion from Oanda.com
* I pretend to be a browser, get the page and extract the part I
want.
* copyright (c) 1998-2005 Roedy Green, Canadian Mind Products
* #327 - 964 Heywood Avenue
* Victoria, BC Canada V8V 2Y5
* tel: (250) 361-9093
* mailto:
[email protected]
*
http://mindprod.com
*/
package com.mindprod.currcon;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import com.mindprod.voter.CGIRequest;
import com.mindprod.voter.CGIget;
public class Oanda
{
/**
* undisplayed copyright notice
*/
private static final String EmbeddedCopyright =
"copyright (c) 2003-2005 Roedy Green, Canadian Mind Products,
http://mindprod.com";
// c o n f i g u r a t i o n s t r i n g s
/**
* Site URL to process the cgi script. without http:// on front
*/
final static String host = "
www.oanda.com";
/**
* Name of CGI Script to process this vote, namely the ACTION
parameter, without host.
* absolute name on host.
*/
final static String relativeURL = "/convert/fxdaily";
/**
* get list of currencies to fetch, and glue them together
separated with underscores.
*
* @return String of currency codes wanted, separated
* by underscores. e.g. CAD_USD_EUR
*/
public static String getWantedCurrencies () throws IOException
{
BufferedReader r = new BufferedReader ( new FileReader (
"oanda.wanted" ), 4096 );
StringBuffer sb = new StringBuffer ( 700 );
String line;
while ( ( line = r.readLine () ) != null )
{
sb.append ( "_" );
sb.append ( line );
}
return sb.toString().substring( 1 );
}
/**
* extract the useful CSV info out of the web page.
*
* @param haystack the entire webpage
* @return Extracted goodies, just the CSV data.
*/
static String extractGoodies ( String haystack )
{
/* Result that comes back in embedded a large web page.
We find it by
<PRE><font face=Verdana size=2>Currency,Code,USD/1 Unit,Units/1
USD
Canadian Dollar,CAD,0.7283,1.3737
Swiss Franc,CHF,0.7739,1.2933
British Pound,GBP,1.6325,0.6127
Japanese Yen,JPY,0.008562,116.9
</TD></TR></font></PRE>
*/
String lookFor = "<PRE><font face=Verdana
size=2>Currency,Code,USD/1 Unit,Units/1 USD";
int startGoodies = haystack.indexOf( lookFor );
if ( startGoodies < 0 )
{
System.out.println("failure. Oanda format change.");
System.exit(1);
return null;
}
else
{
// bypass junk on front
haystack = haystack.substring( startGoodies +
lookFor.length() + 2 );
int endGoodies = haystack.indexOf( "</TD>" );
if ( endGoodies < 0 )
{
System.out.println("failure. Oanda format change.");
System.exit(1);
return null;
}
return haystack.substring( 0, endGoodies );
}
}
/**
* Save the results in the oanda.csv file.
*
* @param result Results to save, in csv format.
*
* @exception IOException
*/
static void save ( String result ) throws IOException
{
// save result in oand.csv ready for further processing.
FileWriter w = new FileWriter ( "oanda.csv" );
w.write( result );
w.close();
}
/**
* Connect and send request
*/
public static void main (String[] args)
{
try
{
String currencies = getWantedCurrencies();
// prepare http parms to server and get return data
CGIRequest p = new CGIRequest( 2000 );
// order appears to matter.
p.appendCGIPair( "value", "1" );
// leave out date to get today's date.
p.appendCGIPair( "date_fmt", "jp" );
p.appendCGIPair( "redirected", "1" );
p.appendCGIPair( "result", "1" );
p.appendCGIPair( "lang", "en" );
p.appendCGIPair( "exch", "USD" );
p.appendCGIPair( "exch2", "" );
p.appendCGIPair( "expr2", "" );
p.appendCGIPair( "format", "CSV" );
p.appendCGIPair( "dest", "Get Table" );
p.appendCGIPair( "sel_list", currencies );
String parms = p.toString();
System.out.println( "sending request to oanda.com. Please be
patient." );
// ask oanda for exchange rates on given list of currencies.
String haystack = CGIget.get( host, relativeURL, parms );
/* extract good stuff from the whole web page */
String result = extractGoodies ( haystack );
System.out.println( result );
save ( result );
}
catch ( IOException e )
{
System.out.println(e);
}
} // end class Main
} // end class Oanda
package com.mindprod.voter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
/**
* Like A StringBuffer but encodes CGI pairs.
*
* @author Roedy Green
* @version 1.0
* @since 2003-05-26
*/
public class CGIRequest
{
    // ideally would extend StringBuffer, but it is final.

    /**
     * Accumulates the encoded name=value pairs, separated by ampersands.
     */
    private final StringBuffer sb;

    /**
     * constructor
     *
     * @param size estimated size of result string.
     */
    public CGIRequest ( int size )
    {
        this.sb = new StringBuffer( size );
    }

    /**
     * Append a parm=value pair of CGI parameters,
     * encoding them with URL encoding, xxx=yyy&amp;aaa=bbb etc.
     * An ampersand is inserted before every pair except the first.
     *
     * @param name parameter name
     *
     * @param value parameter value
     *
     * @exception IllegalArgumentException if the JVM lacks the ASCII
     *            encoding; the original exception is attached as the cause.
     */
    public void appendCGIPair ( String name, String value )
    {
        if ( sb.length() != 0 )
        {
            // separates pairs
            sb.append( '&' );
        }
        try
        {
            sb.append( URLEncoder.encode( name, "ASCII" ) );
            sb.append( '=' );
            sb.append( URLEncoder.encode( value, "ASCII" ) );
        }
        catch ( UnsupportedEncodingException e )
        {
            IllegalArgumentException failure =
                new IllegalArgumentException( "ASCII encoding support missing" );
            // keep the underlying reason visible in stack traces
            failure.initCause( e );
            throw failure;
        }
    }

    /**
     * Get the request as a URL-encoded String.
     *
     * @return result CGI request string.
     */
    public String toString()
    {
        return sb.toString();
    }
}
// Class com/mindprod/voter/CGIpost.java
// copyright (c) 1998-2005 Roedy Green, Canadian Mind Products
// based on work by Jonathan Revusky
// To encode strings use java.net.URLEncoder.encode;
// and java.net.URLDecoder.decode or CGIRequest.
/*
* copyright (c) 1998-2005
* Roedy Green
* Canadian Mind Products
* #327 - 964 Heywood Avenue
* Victoria, BC Canada V8V 2Y5
* tel: (250) 361-9093
* mailto:
[email protected]
*
http://mindprod.com
*/
// Version 1.0
package com.mindprod.voter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
/**
* simulates a browser posting a form to CGI via POST.
*/
public class CGIpost
{
/**
* Static only. Prevent instantiation.
*/
private CGIpost()
{
}
/**
* Send a formful of data to the CGI host using POST
*
* @param websiteURL URL of the website
* @param relativeURL
* relative URL of the document/CGI desired
* Absolute begin with /.
*
* @param parms parms to send, encoded with URLEncoder
*
* @return CGI host's response with headers and embedded length
fields stripped
* @exception IOException
*/
public static String post( String websiteURL, String relativeURL,
String parms ) throws IOException
{
// O P E N
URL url = new URL ( "http://" + websiteURL + '/' + relativeURL
);
HttpURLConnection urlc =
(HttpURLConnection)url.openConnection();
urlc.setAllowUserInteraction( false );
urlc.setDoInput( true );
urlc.setDoOutput( true );
urlc.setUseCaches( false );
urlc.setRequestMethod( "POST" );
// could set Referer: here
urlc.setRequestProperty( "Content-length", ( Integer.toString(
parms.length() ) ) );
urlc.connect();
InputStream is = urlc.getInputStream();
OutputStream os = urlc.getOutputStream();
// parms are the data content.
os.write( parms.getBytes( "8859_1" /* encoding */ ) );
os.close();
int statusCode;
statusCode = urlc.getResponseCode();
// get size of message. -1 means comes in an indeterminate
number of chunks.
int estimatedLength = (int)urlc.getContentLength();
if ( estimatedLength < 0 )
{
estimatedLength = 32*1024;
}
// R E A D
String result = CGIget.readEverything( is, estimatedLength );
// C L O S E
is.close();
urlc.disconnect();
return result;
} // end get
/**
* Send a formful of data to the CGI host using POST.
* @param websiteURL URL of the website
* @param relativeURL
* relative URL of the document/CGI desired
* Absolute begin with /.
*
* @return CGI host's response, including headers and length
fields.
*/
public static String rawPost( String websiteURL, String
relativeURL, String parms ) throws IOException
{
URL url = new URL( "http://" + websiteURL );
int port = url.getPort();
if ( port == -1 ) port = 80;
Socket sock = new Socket( websiteURL, port );
// Obtain data streams
OutputStream os = sock.getOutputStream();
InputStream is = sock.getInputStream();
StringBuffer sb = new StringBuffer( 1000 );
sb.append( "POST" );
sb.append( " " );
sb.append( relativeURL );
sb.append( "HTTP/1.1\n" );
sb.append( " " );
sb.append( "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0;
Windows NT 5.0) Opera 7.11 [en]\n" );
sb.append( "Host: " );
sb.append( websiteURL );
sb.append( "\n" );
sb.append( "Accept: text/html, image/png, image/jpeg, image/gif,
image/x-xbitmap, */*;q=0.1\n" );
sb.append( "Accept-Language: en\n" );
sb.append( "Accept-Charset: windows-1252, utf-8, utf-16,
iso-8859-1;q=0.6, *;q=0.1\n" );
sb.append( "Accept-Encoding: deflate, gzip, x-gzip, identity,
*;q=0\n" );
// Referer: could go here
// cookies would go here.
sb.append( "Connection: Keep-Alive, TE\n" );
sb.append( "TE: deflate, gzip, chunked, identity, trailers\n" );
sb.append( "Content-type: application/x-www-form-urlencoded\n"
);
sb.append( "Content-Length: " );
sb.append( parms.length() );
sb.append( "\n\n" );
String header = sb.toString();
os.write( header.getBytes( "8859_1" /* encoding */ ) );
os.write( parms.getBytes( "8859_1" /* encoding */ ) );
os.close();
// Read data FROM server till -1 eof
String result = CGIget.readEverything ( is, 32*1024 );
is.close();
return result;
} // end post
} // end class CGIPost
// Class com/mindprod/voter/CGIget.java
// copyright (c) 1998-2005 Roedy Green, Canadian Mind Products
// based on work by Jonathan Revusky
// To encode strings use java.net.URLEncoder.encode;
// and java.net.URLDecoder.decode or CGIRequest.
/*
* copyright (c) 1998-2005
* Roedy Green
* Canadian Mind Products
* #327 - 964 Heywood Avenue
* Victoria, BC Canada V8V 2Y5
* tel: (250) 361-9093
* mailto:
[email protected]
*
http://mindprod.com
*/
// Version 1.0
package com.mindprod.voter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
/**
* simulates a browser posting a form to CGI via GET.
*/
public class CGIget
{
    /**
     * Static only. Prevent instantiation.
     */
    private CGIget()
    {
    }

    /**
     * Assemble the complete http URL for a GET request.
     * Exactly one slash separates host and path, and the ?query
     * part is left off when there are no parms.
     *
     * @param websiteURL host of the website, without http:// on front
     * @param relativeURL
     *                   relative URL of the document/CGI desired,
     *                   with or without a leading /.
     * @param parms      parms to send, encoded with URLEncoder;
     *                   null or empty for none.
     * @return the URL as a String, e.g. http://host/path?a=1
     */
    static String buildUrl( String websiteURL, String relativeURL, String parms )
    {
        StringBuffer sb = new StringBuffer( 200 );
        sb.append( "http://" );
        sb.append( websiteURL );
        if ( !relativeURL.startsWith( "/" ) )
        {
            // only supply the slash when the caller did not
            sb.append( '/' );
        }
        sb.append( relativeURL );
        if ( parms != null && parms.length() > 0 )
        {
            sb.append( '?' );
            sb.append( parms );
        }
        return sb.toString();
    }

    /**
     * Send a formful of data to the CGI host using GET,
     * via HttpURLConnection.
     *
     * @param websiteURL host of the website, without http:// on front
     * @param relativeURL
     *                   relative URL of the document/CGI desired.
     *                   Absolute begin with /.
     *
     * @param parms      parms to send, encoded with URLEncoder
     *
     * @return CGI host's response body (HttpURLConnection has already
     *         dealt with headers and embedded length fields)
     * @exception IOException if the connection or transfer fails.
     */
    public static String get( String websiteURL, String relativeURL,
                              String parms ) throws IOException
    {
        // O P E N
        URL url = new URL( buildUrl( websiteURL, relativeURL, parms ) );
        HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
        try
        {
            urlc.setAllowUserInteraction( false );
            urlc.setDoInput( true );
            urlc.setDoOutput( false );
            urlc.setUseCaches( false );
            urlc.setRequestMethod( "GET" );
            urlc.connect();
            InputStream is = urlc.getInputStream();
            try
            {
                // get size of message. -1 means comes in an indeterminate
                // number of chunks.
                int estimatedLength = urlc.getContentLength();
                if ( estimatedLength < 0 )
                {
                    estimatedLength = 32*1024;
                }
                // R E A D
                return readEverything( is, estimatedLength );
            }
            finally
            {
                // C L O S E
                is.close();
            }
        }
        finally
        {
            urlc.disconnect();
        }
    } // end get

    /**
     * Send a formful of data to the CGI host using GET over a raw
     * socket, pretending to be a browser.
     *
     * @param websiteURL host of the website, optionally host:port,
     *                   without http:// on front
     * @param relativeURL
     *                   relative URL of the document/CGI desired.
     *                   Absolute begin with /.
     * @param parms      parms to send, encoded with URLEncoder
     * @return CGI host's response, raw, everything including headers
     *         and embedded length fields
     * @exception IOException if the connection or transfer fails.
     */
    public static String getRaw( String websiteURL, String relativeURL,
                                 String parms ) throws IOException
    {
        URL url = new URL( "http://" + websiteURL );
        int port = url.getPort();
        if ( port == -1 )
        {
            port = 80;
        }
        // connect to the bare host name; websiteURL may carry a :port suffix
        Socket sock = new Socket( url.getHost(), port );
        try
        {
            // Obtain data streams
            OutputStream os = sock.getOutputStream();
            InputStream is = sock.getInputStream();

            StringBuffer sb = new StringBuffer( 1000 );
            // request line: METHOD SP path[?query] SP version
            sb.append( "GET" );
            sb.append( " " );
            sb.append( relativeURL );
            if ( parms.length() > 0 )
            {
                sb.append( "?" );
                sb.append( parms );
            }
            sb.append( " " );
            sb.append( "HTTP/1.1\n" );
            sb.append( "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0) Opera 7.11 [en]\n" );
            sb.append( "Host: " );
            sb.append( websiteURL );
            sb.append( "\n" );
            sb.append( "Accept: text/html, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1\n" );
            sb.append( "Accept-Language: en\n" );
            sb.append( "Accept-Charset: windows-1252, utf-8, utf-16, iso-8859-1;q=0.6, *;q=0.1\n" );
            sb.append( "Accept-Encoding: deflate, gzip, x-gzip, identity, *;q=0\n" );
            // Referer: could go here.
            // cookies could go here.
            sb.append( "Connection: Keep-Alive, TE\n" );
            sb.append( "TE: deflate, gzip, chunked, identity, trailers\n" );
            sb.append( "Content-type: application/x-www-form-urlencoded\n" );
            sb.append( "\n" );
            String header = sb.toString();

            os.write( header.getBytes( "8859_1" /* encoding */ ) );
            // Flush rather than close: closing a socket's output stream
            // closes the whole socket, which would make the read below fail.
            os.flush();

            // Read data FROM server till -1 eof
            // get everything. headers, embedded length counts etc.
            // NOTE(review): the request asks for Keep-Alive, so eof presumably
            // arrives only once the server decides to drop the connection.
            return readEverything( is, 32*1024 );
        }
        finally
        {
            // closes both streams as well
            sock.close();
        }
    } // end getRaw

    /**
     * Used to read until EOF on an InputStream that
     * sometimes returns 0 bytes because data have
     * not arrived yet. Does not close the stream.
     * Bytes are converted to chars using the 8859_1 encoding.
     *
     * @param is     InputStream to read from.
     * @param estimatedLength
     *               Estimated number of bytes that will be read.
     *               -1 or 0 mean you have no idea. Best to make
     *               some sort of guess a little on the high side.
     * @return String representing the contents of the entire
     *         stream.
     * @exception IOException if an I/O error occurs.
     */
    public static String readEverything( InputStream is,
                                         int estimatedLength ) throws IOException
    {
        if ( estimatedLength <= 0 )
        {
            estimatedLength = 10*1024;
        }
        StringBuffer buf = new StringBuffer( estimatedLength );
        int chunkSize = Math.min( estimatedLength, 4*1024 );
        byte[] ba = new byte[ chunkSize ];
        // -1 means eof, 0 means none available for now.
        int bytesRead;
        while ( ( bytesRead = is.read( ba, 0, chunkSize ) ) >= 0 )
        {
            if ( bytesRead == 0 )
            {
                try
                {
                    // no data for now
                    // wait a while before trying again to see if data has arrived.
                    // avoid hogging cpu in a tight loop
                    Thread.sleep( 100 );
                }
                catch ( InterruptedException e )
                {
                    // remember the interrupt for whoever called us
                    Thread.currentThread().interrupt();
                }
            }
            else
            {
                // got some data
                buf.append( new String( ba, 0, bytesRead, "8859_1" /* encoding */ ) );
            }
        }
        return buf.toString();
    } // end readEverything

    /**
     * Reads exactly len bytes from the input stream
     * into the byte array. This method reads repeatedly from the
     * underlying stream until all the bytes are read.
     * InputStream.read is often documented to block like this, but in
     * actuality it does not always do so, and returns early with just
     * a few bytes.
     * readBlocking blocks until all the bytes are read,
     * the end of the stream is detected,
     * or an exception is thrown. You will always get as many bytes as
     * you asked for unless you get an eof or other exception.
     * Unlike readFully, you find out how many bytes you did get.
     *
     * @param in  the stream to read from.
     * @param b   the buffer into which the data is read.
     * @param off the start offset of the data in the array,
     *            not offset into the file!
     * @param len the number of bytes to read.
     * @return number of bytes actually read.
     * @exception IOException if an I/O error occurs.
     */
    public static final int readBlocking ( InputStream in, byte b[],
                                           int off, int len ) throws IOException
    {
        int totalBytesRead = 0;
        while ( totalBytesRead < len )
        {
            int bytesRead = in.read( b, off + totalBytesRead, len - totalBytesRead );
            if ( bytesRead < 0 )
            {
                // eof: report the short count
                break;
            }
            if ( bytesRead == 0 )
            {
                try
                {
                    // no data for now
                    // wait a while before trying again to see if data has arrived.
                    // avoid hogging cpu in a tight loop
                    Thread.sleep( 100 );
                }
                catch ( InterruptedException e )
                {
                    // remember the interrupt for whoever called us
                    Thread.currentThread().interrupt();
                }
            }
            else
            {
                totalBytesRead += bytesRead;
            }
        }
        return totalBytesRead;
    } // end readBlocking
} // end class CGIget