code to clean up texts

Discussion in 'Java' started by lbrtchx@hotmail.com, Jun 4, 2007.

  1. Guest

    Hi,
    ~
    Does anyone around here know of any data analysis/text cleansing
    libraries or code that programmatically consolidate the lines of a
    text into whole paragraphs?
    ~
    does any one around here know of data analysis/text cleansing
    libraries/code to programmatically consolidate lines in a text to
    whole paragraphs?
    ~
    Say you have this:
    ~
    Four score and seven years ago
    our fathers brought forth on this continent,
    a new nation, conceived in Liberty,
    and dedicated to the proposition
    that all men are created equal.
    ~
    to a whole paragraph
    ~
    Four score and seven years ago our fathers brought forth on this
    continent, a new nation, conceived in Liberty, and dedicated to the
    proposition that all men are created equal.
    ~
    Where can I find them?
    ~
    Thanks
    lbrtchx
     
    , Jun 4, 2007
    #1
    1. Advertising

  2. Hal Rosser Guest

    <> wrote in message
    news:...
    > Hi,
    > ~
    > does any one around here know of data analysis/text cleansing
    > libraries/code to programmatically consolidate lines in a text to
    > whole paragraphs?
    > ~
    > does any one around here know of data analysis/text cleansing
    > libraries/code to programmatically consolidate lines in a text to
    > whole paragraphs?
    > ~
    > Say you have this:
    > ~
    > Four score and seven years ago
    > our fathers brought forth on this continent,
    > a new nation, conceived in Liberty,
    > and dedicated to the proposition
    > that all men are created equal.
    > ~
    > to a whole paragraph
    > ~
    > Four score and seven years ago our fathers brought forth on this
    > continent, a new nation, conceived in Liberty, and dedicated to the
    > proposition that all men are created equal.
    > ~
    > Where can I find them?
    > ~
    > Thanks
    > lbrtchx


    All you need to do is remove the \n's and \r's and that other character I
    don't recall.
     
    Hal Rosser, Jun 5, 2007
    #2
    1. Advertising

  3. Roedy Green Guest

    On Mon, 04 Jun 2007 12:43:47 -0700, wrote, quoted
    or indirectly quoted someone who said :

    > does any one around here know of data analysis/text cleansing
    >libraries/code to programmatically consolidate lines in a text to
    >whole paragraphs?


    Here is a little utility I use called REFLOW. I have never published
    it, so it may be a little crude.

    // com.mindprod.reflow.Reflow.java
    package com.mindprod.reflow;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.EOFException;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.util.ArrayList;
    import java.util.Iterator;

    /**
     * Reflows the lines of a text file into paragraphs.
     * <p>
     * Words are re-packed into lines of at most {@link #LINELENGTH} characters;
     * a word longer than that is never split. A blank line in the input ends a
     * paragraph, and every paragraph written is followed by a single blank line.
     * The input file is replaced in place by the reflowed text.
     * <p>
     * usage: java com.mindprod.reflow.Reflow file.txt
     * <p>
     * copyright (c) 2003-2007 Roedy Green, Canadian Mind Products
     * #101 - 2536 Wark Street
     * Victoria, BC Canada V8T 4G8
     * tel: (250) 361-9093
     * http://mindprod.com
     * <p>
     * Source and executables may be freely used for any purpose except military.
     */
    public class Reflow
    {

        /**
         * Max line length of output. Ideally would be configurable.
         */
        public static int LINELENGTH = 60;

        private static final String EmbeddedCopyright =
            "copyright (c) 2003-2007 Roedy Green, Canadian Mind Products, http://mindprod.com";

        // input "before" file
        static String inFilename;
        static File inFile;
        static BufferedReader inReader;

        // output "after" file, the temporary, later renamed to match the input
        static String outFilename;
        static File outFile;
        static PrintWriter outWriter;

        /**
         * Command line utility to reflow the text.
         * <p>
         * Writes the reflowed text to a temporary file and only then replaces the
         * input file with it. The input file is left untouched if reading or
         * writing failed.
         *
         * @param args command line; must hold exactly one filename
         */
        public static void main( String[] args )
        {
            try
            {
                analyseCommandLine( args );

                /* Open input "before" file first, so that a missing file is reported early. */
                openInReader();

                /* Open output "after" file. */
                openOutWriter();

                System.out.println( "Reflowing " + inFilename );

                /* Copy inReader to outWriter, reflowing the text. */
                processFiles();

                inReader.close();
                outWriter.close();

                // PrintWriter never throws IOException; it only records that one happened.
                // Check before touching the original so a failed write cannot destroy it.
                if ( outWriter.checkError() )
                {
                    throw new IOException( "error writing " + outFile );
                }

                /* Rename output to input. */
                if ( !inFile.delete() )
                {
                    System.out.println( "Oops! Cannot replace file " + inFilename );
                    outFile.delete();
                    die();
                }
                if ( !outFile.renameTo( inFile ) )
                {
                    // The original is already gone: keep the temporary file, it is the only copy.
                    System.out.println( "Oops! Cannot rename " + outFile + " to " + inFilename
                                        + ". The reflowed text is in " + outFile );
                    die();
                }
                // don't delete outFile, it has been renamed to a real file
            }
            catch ( IOException e )
            {
                System.out.print( "Oops! IO failure. e.g. out of disk space. \n" );
                // The original has not been deleted at this point, so the partial
                // temporary file is just litter. Close it first so it can be removed.
                if ( outWriter != null )
                {
                    outWriter.close();
                }
                if ( outFile != null )
                {
                    outFile.delete();
                }
                die();
            }
        } // end main

        /**
         * Analyse the command line. It must hold exactly one argument, the name of
         * the file to reflow, which is stored in {@link #inFilename}. Otherwise the
         * banner and a usage message are printed and the run is aborted.
         *
         * @param args command line arguments as passed to main
         */
        static void analyseCommandLine( String[] args )
        {
            if ( args.length != 1 )
            {
                banner();
                System.out.println( "Oops! usage: com.mindprod.reflow.Reflow Myfile.txt \n" );
                die();
            }

            inFilename = args[0]; /* file to convert */
        } // end analyseCommandLine

        /**
         * Display a banner about the author.
         */
        static void banner()
        {
            /* Usually not displayed, just embedded. */

            // The decoration is written with Unicode escapes so the source compiles
            // identically under any source encoding.
            // NOTE(review): these four characters look like CP437 shading blocks that
            // were mis-decoded as Latin-1 -- confirm the intended glyphs.
            System.out.println( "\n\u00B0\u00B1\u00B2\u00DB Reflow 1.0 \u00DB\u00B2\u00B1\u00B0"
                                + "\nFreeware to reflow text."
                                + "\ncopyright (c) 2003-2007 Roedy Green, Canadian Mind Products"
                                + "\n#101 - 2536 Wark Street, Victoria, BC Canada V8T 4G8"
                                + "\nTelephone: (250) 361-9093 Internet: http://mindprod.com"
                                + "\nMay be used freely for non-military use only\n\n" );
        } // end banner

        /**
         * Open the input "before" file named by {@link #inFilename}.
         * <p>
         * Aborts the run with a message if the file does not exist, cannot be read,
         * or cannot be written (it will be replaced at the end of the run).
         */
        static void openInReader()
        {
            try
            {
                inFile = new File( inFilename );
                if ( !inFile.exists() )
                {
                    rejectInput( "Oops! Cannot find file " );
                }
                if ( !inFile.canRead() )
                {
                    rejectInput( "Oops! no permission to read (i.e. examine) the file " );
                }
                if ( !inFile.canWrite() )
                {
                    rejectInput( "Oops! no permission to write (i.e. change) the file " );
                }

                inReader = new BufferedReader( new FileReader( inFile ), 4096 /* buffsize */ );
            }
            catch ( FileNotFoundException e )
            {
                rejectInput( "Oops! Cannot open file " );
            }
        } // end openInReader

        /**
         * Print the banner and the given complaint followed by the input filename,
         * then abort the run.
         *
         * @param problem text to print immediately before the filename
         */
        private static void rejectInput( String problem )
        {
            banner();
            System.out.print( problem );
            System.out.println( inFilename );
            die();
        }

        /**
         * Open the output "after" file, a temporary file created in the same
         * directory as the input file. Aborts the run if it cannot be created.
         */
        static void openOutWriter()
        {
            try
            {
                // get a temporary file in the same directory as inFile.
                outFile = createTempFile( "Reflow", ".tmp", inFile );
                outWriter = new PrintWriter(
                    new BufferedWriter( new FileWriter( outFile ), 4096 /* buffsize */ ),
                    false /* auto flush */ );
            }
            catch ( IOException e )
            {
                System.out.println( "Oops! Cannot create the temporary work file\n" );
                // The file itself may have been created before the writer failed to open.
                if ( outFile != null )
                {
                    outFile.delete();
                }
                die();
            }
        } // end openOutWriter

        /**
         * Create a temporary file.
         * Slightly smarter version of File.createTempFile.
         *
         * @param prefix beginning letters of filename
         * @param suffix ending letters of filename.
         * @param near directory where to put file, or file to
         * place this temp file near in the same directory.
         * null means put the temp file in the
         * current directory.
         * @return A temporary file. It will not automatically
         * delete on program completion, however.
         * @exception IOException if the file cannot be created
         */
        public static File createTempFile( String prefix, String suffix, File near ) throws IOException
        {
            File dir = null;
            if ( near != null )
            {
                if ( near.isDirectory() )
                {
                    dir = near;
                }
                else
                {
                    // Resolve against the current directory first: a bare relative
                    // name such as "file.txt" has no parent of its own.
                    File parent = near.getAbsoluteFile().getParentFile();
                    if ( parent != null && parent.isDirectory() )
                    {
                        dir = parent;
                    }
                }
            }
            if ( dir == null )
            {
                // Anything else, create in the current directory. The two-argument
                // File.createTempFile would use the system temp directory instead.
                dir = new File( System.getProperty( "user.dir" ) );
            }
            return File.createTempFile( prefix, suffix, dir );
        }

        /**
         * Copy inReader to outWriter, reflowing.
         * Presumes the files are already open. Does not close them.
         * <p>
         * Spaces, tabs and non-breaking spaces (char 160) separate words, a newline
         * ends a word, and two newlines with no word character between them end a
         * paragraph. Carriage returns are ignored.
         *
         * @exception IOException if reading the input fails
         */
        static void processFiles() throws IOException
        {
            // list of words in the current paragraph
            ArrayList words = new ArrayList( 149 );

            // have we just seen a new line. Blank lines separate paragraphs.
            boolean recentNL = false;

            // the current word we are building up.
            StringBuffer word = new StringBuffer( 50 );

            int c;
            while ( ( c = inReader.read() ) >= 0 )
            {
                switch ( c )
                {
                    case 160: /* non-breaking space */
                    case ' ':
                    case '\t':
                        endWord( word, words );
                        break;

                    case '\n':
                        endWord( word, words );
                        if ( recentNL )
                        {
                            emitParagraph( words, LINELENGTH );
                            words = new ArrayList( 149 );
                            recentNL = false;
                        }
                        else
                        {
                            recentNL = true;
                        }
                        break;

                    case '\r':
                        /* dos has \r\n, unix just \n */
                        /* we just ignore them here and generate them as needed on \n. */
                        break;

                    default:
                        /* ordinary non-blank char */
                        recentNL = false;
                        word.append( (char) c );
                        break;
                } /* end switch */
            } /* end while */

            // The input may end in the middle of a word (no trailing newline);
            // without this the last word of the file would be lost.
            endWord( word, words );

            // dump possible last paragraph without trailing blank line.
            emitParagraph( words, LINELENGTH );
        } // end processFiles

        /**
         * If a word is being built up, append it to the paragraph and reset the buffer.
         *
         * @param word buffer holding the characters of the word so far
         * @param words list of words in the current paragraph
         */
        private static void endWord( StringBuffer word, ArrayList words )
        {
            if ( word.length() != 0 )
            {
                words.add( word.toString() );
                word.setLength( 0 );
            }
        }

        /**
         * Emits paragraph followed by blank line. Writes to {@link #outWriter}.
         * An empty list produces no output at all.
         *
         * @param words Array list of words (Strings) to output
         * @param maxLineLength
         * maximum line length. If a word is longer
         * it will not be split.
         */
        static void emitParagraph( ArrayList words, int maxLineLength )
        {
            /* if paragraph empty, nothing to do */
            if ( words.size() == 0 )
            {
                return;
            }
            int lineLength = 0;
            for ( Iterator iter = words.iterator(); iter.hasNext(); )
            {
                String word = (String) iter.next();
                if ( lineLength + word.length() + 1 > maxLineLength )
                {
                    // won't fit. Start a new line, unless we are already at the
                    // start of one (an over-long word gets a line to itself).
                    if ( lineLength != 0 )
                    {
                        outWriter.println();
                        lineLength = 0;
                    }
                    // no lead space
                }
                else if ( lineLength != 0 )
                {
                    /* will fit: add lead space */
                    outWriter.print( ' ' );
                    lineLength++;
                }
                outWriter.print( word );
                lineLength += word.length();
            } // end for

            // end the last line, then the blank line that separates paragraphs
            outWriter.println();
            outWriter.println();
        } // end emitParagraph

        /**
         * Make a noise.
         */
        static void honk()
        {
            java.awt.Toolkit.getDefaultToolkit().beep();
        } // end honk

        /**
         * Abort the run, clean up as best as possible.
         * Closes whichever of the two files are open and exits with status 1.
         */
        static void die()
        {
            honk();
            if ( inReader != null )
            {
                try
                {
                    inReader.close();
                }
                catch ( IOException ignored )
                {
                    // already aborting; a failed close must not stop the rest of the cleanup
                }
            }
            if ( outWriter != null )
            {
                outWriter.close();
            }
            System.exit( 1 ); /* exit with errorlevel = 1 */
        } // end die

    } // end class Reflow

    --
    Roedy Green Canadian Mind Products
    The Java Glossary
    http://mindprod.com
     
    Roedy Green, Jun 5, 2007
    #3
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. HDL Book Seller
    Replies:
    0
    Views:
    553
    HDL Book Seller
    May 27, 2004
  2. HDL Book Seller
    Replies:
    0
    Views:
    458
    HDL Book Seller
    Aug 6, 2004
  3. HDL Book Seller

    Verilog & VHDL reference texts

    HDL Book Seller, Aug 31, 2004, in forum: VHDL
    Replies:
    0
    Views:
    550
    HDL Book Seller
    Aug 31, 2004
  4. HDL Book Seller
    Replies:
    0
    Views:
    1,015
    HDL Book Seller
    Dec 1, 2004
  5. Replies:
    8
    Views:
    544
Loading...

Share This Page