code to clean up texts

L

lbrtchx

Hi,
~
Does anyone around here know of data analysis/text cleansing
libraries/code to programmatically consolidate the lines of a text into
whole paragraphs?
~
does any one around here know of data analysis/text cleansing
libraries/code to programmatically consolidate lines in a text to
whole paragraphs?
~
Say you have this:
~
Four score and seven years ago
our fathers brought forth on this continent,
a new nation, conceived in Liberty,
and dedicated to the proposition
that all men are created equal.
~
and you want to consolidate it into a whole paragraph:
~
Four score and seven years ago our fathers brought forth on this
continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.
~
Where can I find them?
~
Thanks
lbrtchx
 
H

Hal Rosser

Hi,
~
does any one around here know of data analysis/text cleansing
libraries/code to programmatically consolidate lines in a text to
whole paragraphs?
~
does any one around here know of data analysis/text cleansing
libraries/code to programmatically consolidate lines in a text to
whole paragraphs?
~
Say you have this:
~
Four score and seven years ago
our fathers brought forth on this continent,
a new nation, conceived in Liberty,
and dedicated to the proposition
that all men are created equal.
~
to a whole paragraph
~
Four score and seven years ago our fathers brought forth on this
continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.
~
Where can I find them?
~
Thanks
lbrtchx

All you need to do is remove the \n's and \r's and that other character I
don't recall.
 
R

Roedy Green

does any one around here know of data analysis/text cleansing
libraries/code to programmatically consolidate lines in a text to
whole paragraphs?

Here is a little utility I use called REFLOW. I have never published
it, so it may be a little crude.

// com.mindprod.reflow.Reflow.java
package com.mindprod.reflow;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Reflows the lines of a text file into paragraphs whose lines are about
 * the same length. Paragraphs in the input are separated by a blank line;
 * paragraphs in the output are separated by a single blank line.
 * The input file is replaced, in place, by the reflowed text.
 *
 * usage: java com.mindprod.reflow.Reflow file.txt
 * copyright (c) 2003-2007 Roedy Green, Canadian Mind Products
 * #101 - 2536 Wark Street
 * Victoria, BC Canada V8T 4G8
 * tel: (250) 361-9093
 * http://mindprod.com
 *
 * Source and executables may be freely used for any purpose except
 * military.
 */
public class Reflow
{

    /**
     * Max line length of output. Ideally would be configurable.
     */
    public static int LINELENGTH = 60;

    private static final String EmbeddedCopyright =
        "copyright (c) 2003-2007 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /** Code point of the non-breaking space, treated like an ordinary blank. */
    private static final int NBSP = 160;

    // input "before" file
    static String inFilename;
    static File inFile;
    static BufferedReader inReader;

    // output "after" file, the temporary, later renamed to match the input
    static String outFilename;
    static File outFile;
    static PrintWriter outWriter;

    /**
     * Command line utility to reflow the text.
     *
     * @param args exactly one element: the name of the file to reflow in place.
     */
    public static void main( String[] args )
    {
        try
        {
            analyseCommandLine( args );

            /* Open input "before" file. Make sure it exists before */
            /* going to the trouble of creating the temporary file. */
            openInReader();

            /* open output "after" file */
            openOutWriter();

            System.out.println( "Reflowing " + inFilename );

            /* copy inReader to outWriter reflowing the text */
            processFiles();

            inReader.close();
            outWriter.close();

            // PrintWriter never throws IOException; it only records that a
            // write failed. Without this check a truncated temp file
            // (e.g. disk full) would silently replace the input.
            if ( outWriter.checkError() )
            {
                throw new IOException( "write to " + outFile.getPath() + " failed" );
            }

            /* Rename output to input */
            replaceInputWithOutput();
            // don't delete outFile, it has been renamed to a real file
        }
        catch ( IOException e )
        {
            System.out.print( "Oops! IO failure. e.g. out of disk space. \n" );
            discardOutFile();
            die();
        }

    } // end main

    /**
     * Replace the input file with the finished temporary output file.
     * Both files must already be closed. Aborts the run via die() when
     * the replacement cannot be done.
     */
    private static void replaceInputWithOutput()
    {
        // Where the platform allows renaming over an existing file, this
        // never leaves a moment at which the input has been deleted but
        // not yet replaced.
        if ( outFile.renameTo( inFile ) )
        {
            return;
        }

        // Otherwise the old file has to go first.
        if ( !inFile.delete() )
        {
            System.out.print( "Oops! Cannot replace the file " );
            System.out.println( inFilename );
            discardOutFile();
            die();
        }
        if ( !outFile.renameTo( inFile ) )
        {
            // The input is gone; keep the temp file, it is the only copy.
            System.out.print( "Oops! Cannot rename the work file. The reflowed text is in " );
            System.out.println( outFile.getPath() );
            die();
        }
    } // end replaceInputWithOutput

    /**
     * Close and delete the temporary output file, if there is one.
     * Used on failure paths so no half-written work file is left behind.
     */
    private static void discardOutFile()
    {
        if ( outWriter != null )
        {
            outWriter.close();
        }
        if ( outFile != null )
        {
            outFile.delete();
        }
    } // end discardOutFile

    /**
     * Analyse the command line. It should have exactly one argument,
     * the name of the file to reflow. Anything else prints the banner
     * and a usage message and aborts the run.
     *
     * @param args command line arguments as passed to main.
     */
    static void analyseCommandLine( String[] args )
    {
        if ( args.length != 1 )
        {
            banner();
            System.out.println( "Oops! usage: com.mindprod.reflow.Reflow Myfile.txt \n" );
            die();
        }

        inFilename = args[0]; /* file to convert */
    } // end analyseCommandLine

    /**
     * Display a banner about the author.
     */
    static void banner()
    {
        /* Usually not displayed, just embedded. */

        // The decoration around the title is written as Unicode escapes so
        // the text printed does not depend on the encoding this source
        // file happens to be compiled with.
        System.out.println( "\n\u00b0\u00b1\u00b2\u00db Reflow 1.0 \u00db\u00b2\u00b1\u00b0"
                            + "\nFreeware to reflow text."
                            + "\ncopyright (c) 2003-2007 Roedy Green, Canadian Mind Products"
                            + "\n#101 - 2536 Wark Street, Victoria, BC Canada V8T 4G8"
                            + "\nTelephone: (250) 361-9093 Internet:[email protected]"
                            + "\nMay be used freely for non-military use only\n\n" );

    } // end banner

    /**
     * Open the input "before" file, setting inFile and inReader.
     * The file must exist and be both readable and writable (it is
     * replaced at the end of the run); otherwise the run is aborted.
     */
    static void openInReader()
    {
        try
        {
            inFile = new File( inFilename );
            if ( !inFile.exists() )
            {
                banner();
                System.out.print( "Oops! Cannot find file " );
                System.out.println( inFilename );
                die();
            }
            if ( !inFile.canRead() )
            {
                banner();
                System.out.print( "Oops! no permission to read (i.e. examine) the file " );
                System.out.println( inFilename );
                die();
            }
            if ( !inFile.canWrite() )
            {
                banner();
                System.out.print( "Oops! no permission to write (i.e. change) the file " );
                System.out.println( inFilename );
                die();
            }

            inReader = new BufferedReader( new FileReader( inFile ), 4096 /* buffsize */ );
        }
        catch ( FileNotFoundException e )
        {
            banner();
            System.out.print( "Oops! Cannot open file " );
            System.out.println( inFilename );
            die();
        }
    } // end openInReader

    /**
     * Open the output "after" file, setting outFile and outWriter.
     * The file is a temporary created in the same directory as inFile,
     * so that it can later be renamed over it.
     */
    static void openOutWriter()
    {
        try
        {
            // get a temporary file in the same directory as inFile.
            outFile = createTempFile( "Reflow", ".tmp", inFile );
            outWriter = new PrintWriter(
                new BufferedWriter( new FileWriter( outFile ), 4096 /* buffsize */ ),
                false /* auto flush */ );
        }
        catch ( IOException e )
        {
            System.out.println( "Oops! Cannot create the temporary work file\n" );
            // the temp file may exist even though it could not be opened
            discardOutFile();
            die();
        }

    } // end openOutWriter

    /**
     * Create a temporary file.
     * Slightly smarter version of File.createTempFile.
     *
     * @param prefix beginning letters of filename
     * @param suffix ending letters of filename.
     * @param near directory where to put file, or file to
     * place this temp file near in the same directory.
     * A relative name is resolved against the current directory.
     * null, or a name whose directory does not exist, means
     * put the temp file in the current directory.
     * @return A temporary file. It will not automatically
     * delete on program completion, however.
     * @exception IOException if the file cannot be created.
     */
    public static File createTempFile( String prefix, String suffix, File near )
        throws IOException
    {
        File dir = null;
        if ( near != null )
        {
            if ( near.isDirectory() )
            {
                dir = near;
            }
            else
            {
                // getParent() of a bare relative name such as "file.txt" is
                // null, so resolve to an absolute path before asking.
                dir = near.getAbsoluteFile().getParentFile();
            }
        }
        if ( dir == null || !dir.isDirectory() )
        {
            // anything else, just create in the current directory.
            // The two-argument File.createTempFile would use the system
            // temp directory instead, possibly on another file system,
            // from which the result could not be renamed into place.
            dir = new File( System.getProperty( "user.dir" ) );
        }
        return File.createTempFile( prefix, suffix, dir );
    } // end createTempFile

    /**
     * Copy inReader to outWriter, reflowing.
     * Presumes files already open. Does not close them.
     * Blanks, tabs and non-breaking spaces separate words; an empty line
     * (two newlines with no word between them) ends a paragraph.
     *
     * @exception IOException if reading the input fails.
     */
    static void processFiles() throws IOException
    {
        // list of words in paragraph
        ArrayList words = new ArrayList( 149 );

        // have we just seen a new line.
        // blank lines separate paragraphs
        boolean recentNL = false;

        // the current word we are building up.
        StringBuffer word = new StringBuffer( 50 );

        // read() reports end of file by returning -1, never by throwing
        int c;
        while ( ( c = inReader.read() ) >= 0 )
        {
            switch ( c )
            {
                case NBSP:
                case ' ':
                case '\t':
                    if ( word.length() != 0 )
                    {
                        words.add( word.toString() );
                        word.setLength( 0 );
                    }
                    break;

                case '\n':
                    if ( word.length() != 0 )
                    {
                        words.add( word.toString() );
                        word.setLength( 0 );
                    }

                    if ( recentNL )
                    {
                        // second newline in a row: paragraph is complete
                        emitParagraph( words, LINELENGTH );
                        words = new ArrayList( 149 );
                        recentNL = false;
                    }
                    else
                    {
                        recentNL = true;
                    }
                    break;

                case '\r':
                    /* dos has \r\n, unix just \n */
                    /* we just ignore them here and generate them as needed on \n. */
                    break;

                default:
                    /* ordinary non-blank char */
                    recentNL = false;
                    word.append( (char) c );
                    break;

            } /* end switch */
        } /* end while */

        // a last word not followed by any white space
        if ( word.length() != 0 )
        {
            words.add( word.toString() );
        }

        // dump possible last paragraph without trailing blank line.
        if ( words.size() != 0 )
        {
            emitParagraph( words, LINELENGTH );
        }
    } // end processFiles

    /**
     * Emits paragraph followed by blank line to outWriter.
     *
     * @param words Array list of words (Strings) to output
     * @param maxLineLength
     * maximum line length. If a word is longer
     * it will not be split.
     */
    static void emitParagraph( ArrayList words, int maxLineLength )
    {
        /* if paragraph empty, nothing to do */
        if ( words.size() == 0 )
        {
            return;
        }
        int lineLength = 0;
        for ( Iterator iter = words.iterator(); iter.hasNext(); )
        {
            String word = (String) iter.next();
            if ( lineLength + word.length() + 1 > maxLineLength )
            {
                // won't fit. Start a new line, unless we are already at the
                // start of one (a single over-long word).
                if ( lineLength != 0 )
                {
                    outWriter.println();
                    lineLength = 0;
                }
                // no lead space
            }
            else
            {
                /* will fit */
                if ( lineLength != 0 )
                {
                    // add lead space
                    outWriter.print( ' ' );
                    lineLength++;
                }
            }
            outWriter.print( word );
            lineLength += word.length();

        } // end for

        // end the last line, then the blank line that separates paragraphs
        outWriter.println();
        outWriter.println();
    } // end emitParagraph

    /**
     * Make a noise.
     */
    static void honk()
    {
        java.awt.Toolkit.getDefaultToolkit().beep();
    } // end honk

    /**
     * Abort the run, clean up as best as possible.
     * Never returns: always ends in System.exit( 1 ).
     */
    static void die()
    {
        try
        {
            honk();
        }
        finally
        {
            // The beep is only decoration; even if it fails we must still
            // close the files and exit.
            try
            {
                if ( inReader != null ) inReader.close();
                if ( outWriter != null ) outWriter.close();
            }
            catch ( IOException ignored )
            {
                // we are exiting anyway; nothing useful can be done about it
            }
            System.exit( 1 ); /* exit with errorlevel = 1 */
        }
    } // end die

} // end class Reflow
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,768
Messages
2,569,574
Members
45,051
Latest member
CarleyMcCr

Latest Threads

Top