I have a byte stream input (about 100 MB). I need to parse the byte stream into one big data object that contains two million data item objects (about 50 bytes each).
Each data item has members such as int, short, and other objects. I have tried looping two million times using a DataInputStream
to solve this, but it takes a few seconds. Is it possible to handle it in one second?
Here is the sample:
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
/**
 * Mutable holder for one parsed item: four nested part objects and two int values.
 * The fields are private and are only written through the setters below.
 */
class DataItem {
    private Part0 member0;
    private Part1 member1;
    private Part3 member3;
    private Part4 member4;
    private int member5;
    private int member6;

    /** Stores the {@code Part0} component of this item. */
    public void setMember0(Part0 part) {
        member0 = part;
    }

    /** Stores the {@code Part1} component of this item. */
    public void setMember1(Part1 part) {
        member1 = part;
    }

    /** Stores the {@code Part3} component of this item. */
    public void setMember3(Part3 part) {
        member3 = part;
    }

    /** Stores the {@code Part4} component of this item. */
    public void setMember4(Part4 part) {
        member4 = part;
    }

    /** Stores the first of the two int values. */
    public void setMember5(int value) {
        member5 = value;
    }

    /** Stores the second of the two int values. */
    public void setMember6(int value) {
        member6 = value;
    }
}
/**
 * Mutable holder that pairs a {@code Part2} value with a text value.
 */
class Part0 {
    Part2 member1;
    String member2;

    /** Stores the nested {@code Part2} value. */
    public void setMember1(Part2 part) {
        member1 = part;
    }

    /** Stores the text value. */
    public void setMember2(String text) {
        member2 = text;
    }
}
/**
 * Mutable holder for one short value followed by six byte values.
 */
class Part1 {
    short member1;
    byte member2;
    byte member3;
    byte member4;
    byte member5;
    byte member6;
    byte member7;

    /** Stores the short value. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores the first byte value. */
    public void setMember2(byte value) {
        member2 = value;
    }

    /** Stores the second byte value. */
    public void setMember3(byte value) {
        member3 = value;
    }

    /** Stores the third byte value. */
    public void setMember4(byte value) {
        member4 = value;
    }

    /** Stores the fourth byte value. */
    public void setMember5(byte value) {
        member5 = value;
    }

    /** Stores the fifth byte value. */
    public void setMember6(byte value) {
        member6 = value;
    }

    /** Stores the sixth byte value. */
    public void setMember7(byte value) {
        member7 = value;
    }
}
/**
 * Mutable holder for three short values, one int value and two byte values.
 */
class Part2 {
    short member1;
    short member2;
    int member3;
    byte member4;
    byte member5;
    short member6;

    /** Stores the first short value. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores the second short value. */
    public void setMember2(short value) {
        member2 = value;
    }

    /** Stores the int value. */
    public void setMember3(int value) {
        member3 = value;
    }

    /** Stores the first byte value. */
    public void setMember4(byte value) {
        member4 = value;
    }

    /** Stores the second byte value. */
    public void setMember5(byte value) {
        member5 = value;
    }

    /** Stores the third short value. */
    public void setMember6(short value) {
        member6 = value;
    }
}
/**
 * Mutable holder for a pair of short values.
 */
class Part3 {
    short member1;
    short member2;

    /** Stores the first short value. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores the second short value. */
    public void setMember2(short value) {
        member2 = value;
    }
}
/**
 * Mutable holder for one int value followed by two short values.
 */
class Part4 {
    int member1;
    short member2;
    short member3;

    /** Stores the int value. */
    public void setMember1(int member) {
        this.member1 = member;
    }

    /** Stores the first short value. */
    public void setMember2(short member) {
        this.member2 = member;
    }

    /**
     * Misspelled legacy name for {@link #setMember2(short)}; kept so that existing callers
     * keep compiling. Prefer {@code setMember2} in new code.
     */
    public void setzMember2(short member) {
        setMember2(member);
    }

    /** Stores the second short value. */
    public void setMember3(short member) {
        this.member3 = member;
    }
}
/**
 * Benchmark that decodes fixed-layout 50-byte items from an in-memory byte array, once through a
 * {@link DataInputStream} and once through a {@link BufferedInputStream} with hand-written
 * big-endian decoding, and prints the elapsed time of each pass.
 */
public class testForHugeData {

    /** Scratch file that supplies the benchmark input. */
    private static final String TEST_FILE_NAME = "test.txt";

    /** Length in bytes given to the scratch file; change the input size here. */
    private static final long TEST_FILE_SIZE = 100000000L;

    /** Number of bytes in the text field of each item. */
    private static final int TEXT_FIELD_LENGTH = 10;

    /**
     * Creates the scratch file, loads it into memory and runs both benchmark passes.
     *
     * @param args ignored
     * @throws IOException if the scratch file cannot be written or read, or if the input ends
     *         before the requested number of items has been decoded
     */
    public static void main(String[] args) throws IOException {
        int runtimes = 2000000;
        createFile();
        byte[] arr = readWholeFile(new File(TEST_FILE_NAME));
        System.out.println("a input byteStream sized "+arr.length +" is created");
        // Each pass gets its own stream over the same bytes. Chaining one stream on top of the
        // other would let the first pass drain the data, so the second pass would only see
        // end-of-stream and the two timings would not be comparable.
        DataInputStream ds = new DataInputStream(new ByteArrayInputStream(arr));
        BufferedInputStream fs = new BufferedInputStream(new ByteArrayInputStream(arr));
        runTaskForManyTimes(runtimes, ds, fs);
    }

    /**
     * Reads the complete content of {@code file} into a byte array sized from the file length,
     * closing the file afterwards even when reading fails.
     *
     * @param file the file to load
     * @return the file content
     * @throws IOException if the file is too large for one array or cannot be read completely
     */
    private static byte[] readWholeFile(File file) throws IOException {
        long length = file.length();
        if (length > Integer.MAX_VALUE) {
            throw new IOException("file too large for a single byte array: " + length + " bytes");
        }
        byte[] contents = new byte[(int) length];
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            in.readFully(contents);
        } finally {
            in.close();
        }
        return contents;
    }

    /**
     * Decodes {@code runtimes} items from each stream in turn and prints how long each pass took.
     *
     * @param runtimes number of items to decode per pass
     * @param ds source for the {@link DataInputStream} pass
     * @param fs source for the {@link BufferedInputStream} pass
     * @throws IOException if a stream fails or ends before all items are decoded
     */
    private static void runTaskForManyTimes(int runtimes, DataInputStream ds, BufferedInputStream fs)
            throws IOException {
        HageData hugeData = new HageData();
        long start = System.currentTimeMillis();
        for (int i = 0; i < runtimes; i++) {
            hugeData.addDataItems(taskUseDataInputStream(runtimes, ds));
        }
        System.out.println("use dataIuputStream to analyze byte stream:");
        System.out.println(" it takes "+(System.currentTimeMillis()-start)+"ms to loop 2 million times");

        HageData hugeData1 = new HageData();
        start = System.currentTimeMillis();
        for (int i = 0; i < runtimes; i++) {
            hugeData1.addDataItems(taskUseBufferedInputStream(runtimes, fs));
        }
        System.out.println("use bufferedIuputStream to analyze byte stream:");
        System.out.println(" it takes "+(System.currentTimeMillis()-start)+"ms to loop 2 million times");
    }

    /**
     * Decodes one 50-byte item from {@code ds}.
     *
     * @param runtimes unused; kept so the signature matches the existing call sites
     * @param ds stream positioned at the first byte of an item
     * @return the decoded item
     * @throws IOException if the stream fails or ends inside the item
     */
    private static DataItem taskUseDataInputStream(int runtimes, DataInputStream ds) throws IOException {
        DataItem item = new DataItem();

        Part1 part1 = new Part1();
        part1.setMember1(ds.readShort());
        part1.setMember2(ds.readByte());
        part1.setMember3(ds.readByte());
        part1.setMember4(ds.readByte());
        part1.setMember5(ds.readByte());
        part1.setMember6(ds.readByte());
        part1.setMember7(ds.readByte());
        item.setMember1(part1);

        // The setter order below is the wire order of the fields, not their numeric order.
        Part2 part2 = new Part2();
        part2.setMember1(ds.readShort());
        part2.setMember3(ds.readInt());
        part2.setMember5(ds.readByte());
        part2.setMember2(ds.readShort());
        part2.setMember6(ds.readShort());
        part2.setMember4(ds.readByte());

        byte[] text = new byte[TEXT_FIELD_LENGTH];
        ds.readFully(text);

        Part0 part0 = new Part0();
        part0.setMember1(part2);
        // NOTE(review): decodes with the JVM default charset; confirm the intended encoding.
        part0.setMember2(new String(text));
        item.setMember0(part0);

        Part3 part3 = new Part3();
        part3.setMember1(ds.readShort());
        part3.setMember2(ds.readShort());
        item.setMember3(part3);

        Part4 part4 = new Part4();
        part4.setMember1(ds.readInt());
        part4.setzMember2(ds.readShort());
        part4.setMember3(ds.readShort());
        item.setMember4(part4);

        item.setMember5(ds.readInt());
        item.setMember6(ds.readInt());
        return item;
    }

    /**
     * Decodes one 50-byte item from {@code fs}, using the same field order as
     * {@link #taskUseDataInputStream(int, DataInputStream)}.
     *
     * @param runtimes unused; kept so the signature matches the existing call sites
     * @param fs stream positioned at the first byte of an item
     * @return the decoded item
     * @throws IOException if the stream fails or ends inside the item
     */
    private static DataItem taskUseBufferedInputStream(int runtimes, BufferedInputStream fs) throws IOException {
        DataItem item = new DataItem();

        Part1 part1 = new Part1();
        part1.setMember1(readShort(fs));
        part1.setMember2(readByte(fs));
        part1.setMember3(readByte(fs));
        part1.setMember4(readByte(fs));
        part1.setMember5(readByte(fs));
        part1.setMember6(readByte(fs));
        part1.setMember7(readByte(fs));
        item.setMember1(part1);

        // The setter order below is the wire order of the fields, not their numeric order.
        Part2 part2 = new Part2();
        part2.setMember1(readShort(fs));
        part2.setMember3(readInt(fs));
        part2.setMember5(readByte(fs));
        part2.setMember2(readShort(fs));
        part2.setMember6(readShort(fs));
        part2.setMember4(readByte(fs));

        byte[] text = new byte[TEXT_FIELD_LENGTH];
        readFully(fs, text);

        Part0 part0 = new Part0();
        part0.setMember1(part2);
        // NOTE(review): decodes with the JVM default charset; confirm the intended encoding.
        part0.setMember2(new String(text));
        item.setMember0(part0);

        Part3 part3 = new Part3();
        part3.setMember1(readShort(fs));
        part3.setMember2(readShort(fs));
        item.setMember3(part3);

        Part4 part4 = new Part4();
        part4.setMember1(readInt(fs));
        part4.setzMember2(readShort(fs));
        part4.setMember3(readShort(fs));
        item.setMember4(part4);

        item.setMember5(readInt(fs));
        item.setMember6(readInt(fs));
        return item;
    }

    /**
     * Reads one byte as a value in the range 0..255.
     *
     * @throws EOFException if the stream has no more bytes
     * @throws IOException if the stream fails
     */
    private static int readUnsignedByte(BufferedInputStream fs) throws IOException {
        int value = fs.read();
        if (value < 0) {
            // read() signals end-of-stream with -1; storing that as data would hide truncation.
            throw new EOFException("unexpected end of stream");
        }
        return value;
    }

    /**
     * Reads one signed byte.
     *
     * @throws EOFException if the stream has no more bytes
     * @throws IOException if the stream fails
     */
    private static byte readByte(BufferedInputStream fs) throws IOException {
        return (byte) readUnsignedByte(fs);
    }

    /**
     * Fills {@code target} completely from {@code fs}.
     *
     * @throws EOFException if the stream ends before {@code target} is full
     * @throws IOException if the stream fails
     */
    private static void readFully(BufferedInputStream fs, byte[] target) throws IOException {
        int filled = 0;
        while (filled < target.length) {
            int count = fs.read(target, filled, target.length - filled);
            if (count < 0) {
                throw new EOFException("unexpected end of stream");
            }
            filled += count;
        }
    }

    /**
     * Reads a big-endian 16-bit signed value from {@code fs}.
     *
     * @throws EOFException if the stream ends inside the value
     * @throws IOException if the stream fails
     */
    private static short readShort(BufferedInputStream fs) throws IOException {
        // Combine unsigned byte values: OR-ing a sign-extended low byte would overwrite the
        // high byte whenever the low byte is 0x80 or above.
        int high = readUnsignedByte(fs);
        int low = readUnsignedByte(fs);
        return (short) ((high << 8) | low);
    }

    /**
     * Reads a big-endian 32-bit signed value from {@code fs}.
     *
     * @throws EOFException if the stream ends inside the value
     * @throws IOException if the stream fails
     */
    private static int readInt(BufferedInputStream fs) throws IOException {
        // Unsigned byte values for the same reason as in readShort.
        int b0 = readUnsignedByte(fs);
        int b1 = readUnsignedByte(fs);
        int b2 = readUnsignedByte(fs);
        int b3 = readUnsignedByte(fs);
        return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
    }

    /**
     * Creates the scratch file if necessary and sets its length to {@link #TEST_FILE_SIZE} bytes.
     *
     * @throws IOException if the file cannot be created or resized
     */
    private static void createFile() throws IOException {
        File file = new File(TEST_FILE_NAME);
        if (!file.exists()) {
            file.createNewFile();
        }
        RandomAccessFile resizable = new RandomAccessFile(file, "rw");
        try {
            resizable.setLength(TEST_FILE_SIZE);
        } finally {
            resizable.close();
        }
    }
}
Here is the result:
a input byteStream sized 100000000 is created
use dataIuputStream to analyze byte stream:
it takes 4489ms to loop 2 million times
use bufferedIuputStream to analyze byte stream:
it takes 4686ms to loop 2 million times
So it seems like BufferedInputStream is slower? But when I changed the size of the input byte stream to 400 MB (by changing the test file size to 400 MB), the result became:
a input byteStream sized 400000000 is created
use dataIuputStream to analyze byte stream:
it takes 4740ms to loop 2 million times
use bufferedIuputStream to analyze byte stream:
it takes 1384ms to loop 2 million times
So it seems like the performance of BufferedInputStream depends on the buffer size. Either way, the time cost is too high.