Q
qwertmonkey
From: "qwertmonkey" <qwertmonkey@1:261/38.remove-dpk-this>
From: (e-mail address removed)
~
I don't think this would make sense. All sentences are short and all I
need to do is basically scan them and use look-up tables to do some tinkering
with the code points. The scheduling of threads and constant context switching
will most probably make things slower ~
OK this is the piece of the code I am trying to optimize and the results
I get, using a large enough file with sentences: ~
http://corpora.informatik.uni-leipzig.de/download.html
~
http://corpora.uni-leipzig.de/downloads/deu_news_2008_10M-text.tar.gz
~
inside of the tar ball there is a file with just sentences:
~
$ ls -l deu_news_2008_10M-sentences.txt
-rw-r--r-- 1 knoppix knoppix 1235804164 May 28 2011
deu_news_2008_10M-sentences.txt
$ md5sum -b deu_news_2008_10M-sentences.txt
23041587b6414d1a1a56c9c389d3c18f *deu_news_2008_10M-sentences.txt
$ wc -l deu_news_2008_10M-sentences.txt
10000000 deu_news_2008_10M-sentences.txt ~
Again, do you know of any faster way to go about reading the sentences of
such large files and getting their code points?
lbrtchx
~
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.charset.Charset;
import java.io.BufferedReader;
import java.io.IOException;
// __
/**
 * Small timing harness: reads a UTF-8 text file line by line, walks every
 * code point of every line, and reports line count, code point count and the
 * elapsed wall-clock time on {@code System.err}.
 */
public class NIO2_newBufferedReader02Test{
  private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  /**
   * Runs the measurement.
   *
   * @param aArgs exactly one element: the path of the text file to read;
   *              anything else prints a usage message to {@code System.err}
   */
  public static void main(String[] aArgs){
    if((aArgs == null) || (aArgs.length != 1)){
      System.err.println("// __ usage:" + LINE_SEPARATOR + LINE_SEPARATOR +
        " java NIO2_newBufferedReader02Test \"<text file>\"" + LINE_SEPARATOR);
      return;
    }

    long startMillis = System.currentTimeMillis();
    Path inputPath = FileSystems.getDefault().getPath(aArgs[0]);
    long fileLength = inputPath.toFile().length();
    long lineCount = 0;
    // long, not int: a file only somewhat larger than ~2 GB of ASCII would overflow an int.
    long codePointCount = 0;
    StringBuilder scratch = new StringBuilder(1024);

    // try-with-resources closes the reader even when readLine() throws.
    try(BufferedReader reader = Files.newBufferedReader(inputPath, Charset.forName("UTF-8"))){
      for(String line = reader.readLine(); line != null; line = reader.readLine()){
        codePointCount += copyCodePoints(line, scratch);
        scratch.setLength(0); // reuse the same buffer for the next line
        ++lineCount;
      }
    }catch(IOException ioException){
      ioException.printStackTrace(System.err);
      return;
    }

    // Reported after the reader has been closed, so the close is part of the timing.
    System.err.println("// __ reading |" + fileLength + "| bytes long text file with |"
      + lineCount + "| lines took |" + (System.currentTimeMillis() - startMillis) + "| (ms)");
    System.err.println("// __ iTtlRdKdPnts: |" + codePointCount + "|");
  }

  /**
   * Appends every code point of {@code line} to {@code sink}.
   *
   * @param line text to scan
   * @param sink builder receiving the code points
   * @return the number of code points found in {@code line}
   */
  private static int copyCodePoints(String line, StringBuilder sink){
    int count = 0;
    int length = line.length();
    for(int i = 0; i < length; ){
      int codePoint = line.codePointAt(i);
      sink.appendCodePoint(codePoint);
      // A supplementary code point occupies two chars; stepping by one char
      // would visit its low surrogate again and count it as a second code point.
      i += Character.charCount(codePoint);
      ++count;
    }
    return count;
  }
}
~
$ java -version
java version "1.7.0_02"
Java(TM) SE Runtime Environment (build 1.7.0_02-b13) Java HotSpot(TM) Server VM
(build 22.0-b10, mixed mode) ~
$ free
total used free shared buffers cached
Mem: 4051236 719224 3332012 0 22008 408260
-/+ buffers/cache: 288956 3762280
Swap: 3038424 0 3038424
~
$ javac -encoding utf8 NIO2_newBufferedReader02Test.java
~
$ date; java -Xms256m -Xmx1024m -Xincgc -Dfile.encoding=utf8
NIO2_newBufferedReader02Test /media/sdb1/tmp/eng_news_2006_10M-sentences.txt;
date;
~
Tue Jul 31 02:05:04 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|41922| (ms)
Tue Jul 31 02:05:46 UTC 2012
~
Tue Jul 31 02:05:51 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|27299| (ms)
Tue Jul 31 02:06:19 UTC 2012
~
Tue Jul 31 02:06:22 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|28180| (ms)
Tue Jul 31 02:06:50 UTC 2012
~
Tue Jul 31 02:26:43 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|35388| (ms)
Tue Jul 31 02:27:18 UTC 2012
~
Tue Jul 31 02:27:21 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|38155| (ms)
Tue Jul 31 02:28:00 UTC 2012
~
Tue Jul 31 02:30:40 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|41099| (ms)
Tue Jul 31 02:31:21 UTC 2012
-+- BBBS/Li6 v4.10 Dada-1
+ Origin: Prism bbs (1:261/38)
-+- Synchronet 3.16a-Win32 NewsLink 1.98
Time Warp of the Future BBS - telnet://time.synchro.net:24
--- BBBS/Li6 v4.10 Dada-1
* Origin: Prism bbs (1:261/38)
--- Synchronet 3.16a-Win32 NewsLink 1.98
Time Warp of the Future BBS - telnet://time.synchro.net:24
From: (e-mail address removed)
~
How slow is the NL processing? ~
Does it make any sense to read lines in one thread and pass each off
to one of the iPrx-1 other threads that might run on separate processors? ~
I don't think this would make sense. All sentences are short and all I
need to do is basically scan them and use look-up tables to do some tinkering
with the code points. The scheduling of threads and constant context switching
will most probably make things slower ~
OK this is the piece of the code I am trying to optimize and the results
I get, using a large enough file with sentences: ~
http://corpora.informatik.uni-leipzig.de/download.html
~
http://corpora.uni-leipzig.de/downloads/deu_news_2008_10M-text.tar.gz
~
inside of the tar ball there is a file with just sentences:
~
$ ls -l deu_news_2008_10M-sentences.txt
-rw-r--r-- 1 knoppix knoppix 1235804164 May 28 2011
deu_news_2008_10M-sentences.txt
$ md5sum -b deu_news_2008_10M-sentences.txt
23041587b6414d1a1a56c9c389d3c18f *deu_news_2008_10M-sentences.txt
$ wc -l deu_news_2008_10M-sentences.txt
10000000 deu_news_2008_10M-sentences.txt ~
Again, do you know of any faster way to go about reading the sentences of
such large files and getting their code points?
lbrtchx
~
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.charset.Charset;
import java.io.BufferedReader;
import java.io.IOException;
// __
/**
 * Small timing harness: reads a UTF-8 text file line by line, walks every
 * code point of every line, and reports line count, code point count and the
 * elapsed wall-clock time on {@code System.err}.
 */
public class NIO2_newBufferedReader02Test{
  private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  /**
   * Runs the measurement.
   *
   * @param aArgs exactly one element: the path of the text file to read;
   *              anything else prints a usage message to {@code System.err}
   */
  public static void main(String[] aArgs){
    if((aArgs == null) || (aArgs.length != 1)){
      System.err.println("// __ usage:" + LINE_SEPARATOR + LINE_SEPARATOR +
        " java NIO2_newBufferedReader02Test \"<text file>\"" + LINE_SEPARATOR);
      return;
    }

    long startMillis = System.currentTimeMillis();
    Path inputPath = FileSystems.getDefault().getPath(aArgs[0]);
    long fileLength = inputPath.toFile().length();
    long lineCount = 0;
    // long, not int: a file only somewhat larger than ~2 GB of ASCII would overflow an int.
    long codePointCount = 0;
    StringBuilder scratch = new StringBuilder(1024);

    // try-with-resources closes the reader even when readLine() throws.
    try(BufferedReader reader = Files.newBufferedReader(inputPath, Charset.forName("UTF-8"))){
      for(String line = reader.readLine(); line != null; line = reader.readLine()){
        codePointCount += copyCodePoints(line, scratch);
        scratch.setLength(0); // reuse the same buffer for the next line
        ++lineCount;
      }
    }catch(IOException ioException){
      ioException.printStackTrace(System.err);
      return;
    }

    // Reported after the reader has been closed, so the close is part of the timing.
    System.err.println("// __ reading |" + fileLength + "| bytes long text file with |"
      + lineCount + "| lines took |" + (System.currentTimeMillis() - startMillis) + "| (ms)");
    System.err.println("// __ iTtlRdKdPnts: |" + codePointCount + "|");
  }

  /**
   * Appends every code point of {@code line} to {@code sink}.
   *
   * @param line text to scan
   * @param sink builder receiving the code points
   * @return the number of code points found in {@code line}
   */
  private static int copyCodePoints(String line, StringBuilder sink){
    int count = 0;
    int length = line.length();
    for(int i = 0; i < length; ){
      int codePoint = line.codePointAt(i);
      sink.appendCodePoint(codePoint);
      // A supplementary code point occupies two chars; stepping by one char
      // would visit its low surrogate again and count it as a second code point.
      i += Character.charCount(codePoint);
      ++count;
    }
    return count;
  }
}
~
$ java -version
java version "1.7.0_02"
Java(TM) SE Runtime Environment (build 1.7.0_02-b13) Java HotSpot(TM) Server VM
(build 22.0-b10, mixed mode) ~
$ free
total used free shared buffers cached
Mem: 4051236 719224 3332012 0 22008 408260
-/+ buffers/cache: 288956 3762280
Swap: 3038424 0 3038424
~
$ javac -encoding utf8 NIO2_newBufferedReader02Test.java
~
$ date; java -Xms256m -Xmx1024m -Xincgc -Dfile.encoding=utf8
NIO2_newBufferedReader02Test /media/sdb1/tmp/eng_news_2006_10M-sentences.txt;
date;
~
Tue Jul 31 02:05:04 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|41922| (ms)
Tue Jul 31 02:05:46 UTC 2012
~
Tue Jul 31 02:05:51 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|27299| (ms)
Tue Jul 31 02:06:19 UTC 2012
~
Tue Jul 31 02:06:22 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|28180| (ms)
Tue Jul 31 02:06:50 UTC 2012
~
Tue Jul 31 02:26:43 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|35388| (ms)
Tue Jul 31 02:27:18 UTC 2012
~
Tue Jul 31 02:27:21 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|38155| (ms)
Tue Jul 31 02:28:00 UTC 2012
~
Tue Jul 31 02:30:40 UTC 2012
// __ reading |1280939143| bytes long text file with |10000000| lines took
|41099| (ms)
Tue Jul 31 02:31:21 UTC 2012
-+- BBBS/Li6 v4.10 Dada-1
+ Origin: Prism bbs (1:261/38)
-+- Synchronet 3.16a-Win32 NewsLink 1.98
Time Warp of the Future BBS - telnet://time.synchro.net:24
--- BBBS/Li6 v4.10 Dada-1
* Origin: Prism bbs (1:261/38)
--- Synchronet 3.16a-Win32 NewsLink 1.98
Time Warp of the Future BBS - telnet://time.synchro.net:24