0

I have a byte stream input (about 100 MB). I need to parse the byte stream into one large data object that contains two million data item objects (each about 50 bytes).

Each data item has members such as int, short, and other objects. I tried looping two million times with a DataInputStream to parse the stream, but that takes a few seconds. Is it possible to do it in one second? Here is the sample:

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
/**
 * One parsed record: four nested part objects followed by two trailing {@code int} values.
 *
 * <p>The fields are private and the class exposes setters only, so instances are
 * populated by the parser and not read back through this class.
 */
class DataItem {
    private Part0 member0;
    private Part1 member1;
    private Part3 member3;
    private Part4 member4;
    private int member5;
    private int member6;

    /** Stores the {@link Part0} section of this item. */
    public void setMember0(Part0 value) {
        member0 = value;
    }

    /** Stores the {@link Part1} section of this item. */
    public void setMember1(Part1 value) {
        member1 = value;
    }

    /** Stores the {@link Part3} section of this item. */
    public void setMember3(Part3 value) {
        member3 = value;
    }

    /** Stores the {@link Part4} section of this item. */
    public void setMember4(Part4 value) {
        member4 = value;
    }

    /** Stores the first trailing {@code int} value. */
    public void setMember5(int value) {
        member5 = value;
    }

    /** Stores the second trailing {@code int} value. */
    public void setMember6(int value) {
        member6 = value;
    }
}

/**
 * Section of a {@code DataItem} that pairs a {@link Part2} header with a text value.
 */
class Part0 {
    Part2 member1;
    String member2;

    /** Stores the {@link Part2} header. */
    public void setMember1(Part2 value) {
        member1 = value;
    }

    /** Stores the text value. */
    public void setMember2(String value) {
        member2 = value;
    }
}
/**
 * Section of a {@code DataItem} made of one {@code short} followed by six {@code byte} fields.
 */
class Part1 {
    short member1;
    byte member2, member3, member4, member5, member6, member7;

    /** Stores the {@code short} field. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores the first {@code byte} field. */
    public void setMember2(byte value) {
        member2 = value;
    }

    /** Stores the second {@code byte} field. */
    public void setMember3(byte value) {
        member3 = value;
    }

    /** Stores the third {@code byte} field. */
    public void setMember4(byte value) {
        member4 = value;
    }

    /** Stores the fourth {@code byte} field. */
    public void setMember5(byte value) {
        member5 = value;
    }

    /** Stores the fifth {@code byte} field. */
    public void setMember6(byte value) {
        member6 = value;
    }

    /** Stores the sixth {@code byte} field. */
    public void setMember7(byte value) {
        member7 = value;
    }
}
/**
 * Header section holding three {@code short} fields, one {@code int} field and
 * two {@code byte} fields.
 */
class Part2 {
    short member1;
    short member2;
    int member3;
    byte member4;
    byte member5;
    short member6;

    /** Stores {@code member1}. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores {@code member2}. */
    public void setMember2(short value) {
        member2 = value;
    }

    /** Stores {@code member3}. */
    public void setMember3(int value) {
        member3 = value;
    }

    /** Stores {@code member4}. */
    public void setMember4(byte value) {
        member4 = value;
    }

    /** Stores {@code member5}. */
    public void setMember5(byte value) {
        member5 = value;
    }

    /** Stores {@code member6}. */
    public void setMember6(short value) {
        member6 = value;
    }
}
/**
 * Section of a {@code DataItem} made of two {@code short} fields.
 */
class Part3 {
    short member1;
    short member2;

    /** Stores the first {@code short} field. */
    public void setMember1(short value) {
        member1 = value;
    }

    /** Stores the second {@code short} field. */
    public void setMember2(short value) {
        member2 = value;
    }
}
/**
 * Section of a {@code DataItem} made of one {@code int} followed by two {@code short} fields.
 */
class Part4 {
    int member1;
    short member2;
    short member3;

    /** Stores the {@code int} field. */
    public void setMember1(int member) {
        this.member1 = member;
    }

    /** Stores the first {@code short} field. */
    public void setMember2(short member) {
        this.member2 = member;
    }

    /**
     * Misspelled alias of {@link #setMember2(short)}, kept so existing callers still compile.
     *
     * @deprecated use {@link #setMember2(short)}; the name of this method is a typo.
     */
    @Deprecated
    public void setzMember2(short member) {
        setMember2(member);
    }

    /** Stores the second {@code short} field. */
    public void setMember3(short member) {
        this.member3 = member;
    }
}
/**
 * Micro-benchmark that parses a byte array into {@code DataItem} objects twice: once through
 * a {@link DataInputStream} and once through a {@link BufferedInputStream} with hand-written
 * big-endian decoding.
 *
 * <p>Every item consumes 50 bytes (8 for {@code Part1}, 12 for {@code Part2}, a 10-byte text
 * field, 4 for {@code Part3}, 8 for {@code Part4} and two trailing ints), so two million
 * items need exactly 100,000,000 bytes of input.
 *
 * <p>The timings include JIT warm-up; run the loops several times in one JVM before
 * comparing the two approaches.
 */
public class testForHugeData {

    /** Name of the scratch file that supplies the benchmark input. */
    private static final String TEST_FILE = "test.txt";

    /** Number of bytes in the text field of every item. */
    private static final int TEXT_LENGTH = 10;

    /**
     * Creates the input file, loads it into memory and runs both parsers over it.
     *
     * @param args unused
     * @throws IOException if the file cannot be created or read, or the input is too short
     */
    public static void main(String[]args) throws IOException {
        int runtimes = 2000000;
        createFile();

        byte[] arr = readFile(TEST_FILE);
        System.out.println("a input byteStream sized "+arr.length +" is created");

        // Each parser gets its own stream over the same bytes. Sharing a single
        // ByteArrayInputStream would let the first loop consume the data, leaving the
        // second loop to read past the end (or to parse a different region of the input).
        DataInputStream ds = new DataInputStream(new ByteArrayInputStream(arr));
        BufferedInputStream fs = new BufferedInputStream(new ByteArrayInputStream(arr));

        runTaskForManyTimes(runtimes,ds,fs);
    }

    /**
     * Parses {@code runtimes} items from each stream and prints the elapsed wall-clock time.
     *
     * @param runtimes number of items to parse from each stream
     * @param ds       source for the {@link DataInputStream} based parser
     * @param fs       source for the {@link BufferedInputStream} based parser
     * @throws IOException if either stream ends before {@code runtimes} items were read
     */
    private static void runTaskForManyTimes(int runtimes, DataInputStream ds, BufferedInputStream fs) throws IOException {

        // NOTE(review): HageData is declared outside this file; only addDataItems is used here.
        HageData hugeData = new HageData();
        long start = System.currentTimeMillis();
        for(int i= 0;i<runtimes;i++) {
            hugeData.addDataItems(taskUseDataInputStream(runtimes,ds));
        }
        System.out.println("use dataIuputStream to analyze byte stream:");
        System.out.println("  it takes "+(System.currentTimeMillis()-start)+"ms to loop 2 million times");

        HageData hugeData1 = new HageData();
        start = System.currentTimeMillis();
        for(int i= 0;i<runtimes;i ++) {
            hugeData1.addDataItems(taskUseBufferedInputStream(runtimes,fs));
        }
        System.out.println("use bufferedIuputStream to analyze byte stream:");
        System.out.println("  it takes "+(System.currentTimeMillis()-start)+"ms to loop 2 million times");
    }

    /**
     * Reads one 50-byte item using the primitives of {@link DataInputStream}.
     *
     * @param runtimes unused; kept so the signature matches the buffered variant
     * @param ds       stream positioned at the start of an item
     * @return the populated item
     * @throws IOException if the stream ends inside the item or cannot be read
     */
    private static DataItem taskUseDataInputStream(int runtimes, DataInputStream ds) throws IOException {
        DataItem item = new DataItem();

        Part1 part1 = new Part1();
        part1.setMember1(ds.readShort());
        part1.setMember2(ds.readByte());
        part1.setMember3(ds.readByte());
        part1.setMember4(ds.readByte());
        part1.setMember5(ds.readByte());
        part1.setMember6(ds.readByte());
        part1.setMember7(ds.readByte());
        item.setMember1(part1);

        // The Part2 fields are deliberately read in wire order, not in field-number order.
        Part2 part2 = new Part2();
        part2.setMember1(ds.readShort());
        part2.setMember3(ds.readInt());
        part2.setMember5(ds.readByte());
        part2.setMember2(ds.readShort());
        part2.setMember6(ds.readShort());
        part2.setMember4(ds.readByte());

        byte[] tmp = new byte[TEXT_LENGTH];
        ds.readFully(tmp);

        Part0 part0 = new Part0();
        part0.setMember1(part2);
        // NOTE(review): decodes with the platform default charset, as the original did.
        part0.setMember2(new String(tmp));
        item.setMember0(part0);

        Part3 part3 = new Part3();
        part3.setMember1(ds.readShort());
        part3.setMember2(ds.readShort());
        item.setMember3(part3);

        Part4 part4 = new Part4();
        part4.setMember1(ds.readInt());
        part4.setzMember2(ds.readShort());
        part4.setMember3(ds.readShort());
        item.setMember4(part4);

        item.setMember5(ds.readInt());
        item.setMember6(ds.readInt());

        return item;
    }

    /**
     * Reads one 50-byte item from a {@link BufferedInputStream} using the local big-endian
     * helpers; the field layout is identical to {@link #taskUseDataInputStream}.
     *
     * @param runtimes unused; kept so the signature matches the data-stream variant
     * @param fs       stream positioned at the start of an item
     * @return the populated item
     * @throws IOException if the stream ends inside the item or cannot be read
     */
    private static DataItem taskUseBufferedInputStream(int runtimes, BufferedInputStream fs) throws IOException {
        DataItem item = new DataItem();

        Part1 part1 = new Part1();
        part1.setMember1(readShort(fs));
        part1.setMember2(readByte(fs));
        part1.setMember3(readByte(fs));
        part1.setMember4(readByte(fs));
        part1.setMember5(readByte(fs));
        part1.setMember6(readByte(fs));
        part1.setMember7(readByte(fs));
        item.setMember1(part1);

        // The Part2 fields are deliberately read in wire order, not in field-number order.
        Part2 part2 = new Part2();
        part2.setMember1(readShort(fs));
        part2.setMember3(readInt(fs));
        part2.setMember5(readByte(fs));
        part2.setMember2(readShort(fs));
        part2.setMember6(readShort(fs));
        part2.setMember4(readByte(fs));

        byte[] tmp = new byte[TEXT_LENGTH];
        readFully(fs, tmp);

        Part0 part0 = new Part0();
        part0.setMember1(part2);
        // NOTE(review): decodes with the platform default charset, as the original did.
        part0.setMember2(new String(tmp));
        item.setMember0(part0);

        Part3 part3 = new Part3();
        part3.setMember1(readShort(fs));
        part3.setMember2(readShort(fs));
        item.setMember3(part3);

        Part4 part4 = new Part4();
        part4.setMember1(readInt(fs));
        part4.setzMember2(readShort(fs));
        part4.setMember3(readShort(fs));
        item.setMember4(part4);

        item.setMember5(readInt(fs));
        item.setMember6(readInt(fs));

        return item;
    }

    /**
     * Reads a single byte, failing instead of returning the -1 end-of-stream marker as data.
     *
     * @throws EOFException if the stream is exhausted
     */
    private static byte readByte(BufferedInputStream fs) throws IOException {
        int value = fs.read();
        if (value < 0) {
            throw new EOFException();
        }
        return (byte) value;
    }

    /**
     * Fills {@code dest} completely from the stream.
     *
     * @throws EOFException if the stream ends before {@code dest} is full
     */
    private static void readFully(BufferedInputStream fs, byte[] dest) throws IOException {
        int filled = 0;
        while (filled < dest.length) {
            int count = fs.read(dest, filled, dest.length - filled);
            if (count < 0) {
                throw new EOFException();
            }
            filled += count;
        }
    }

    /**
     * Reads a big-endian 16-bit value.
     *
     * <p>The bytes are kept as unsigned ints (0..255) while they are combined; combining
     * signed {@code byte} values would sign-extend a low byte of 0x80 or above and
     * overwrite the high byte.
     *
     * @throws EOFException if the stream ends before two bytes were read
     */
    private static short readShort(BufferedInputStream fs) throws IOException {
        int high = fs.read();
        int low = fs.read();
        // read() yields -1 at end of stream, so a negative OR means at least one byte is missing
        if ((high | low) < 0) {
            throw new EOFException();
        }
        return (short) ((high << 8) | low);
    }

    /**
     * Reads a big-endian 32-bit value; see {@link #readShort} for why the bytes stay unsigned.
     *
     * @throws EOFException if the stream ends before four bytes were read
     */
    private static int readInt(BufferedInputStream fs) throws IOException {
        int b1 = fs.read();
        int b2 = fs.read();
        int b3 = fs.read();
        int b4 = fs.read();
        if ((b1 | b2 | b3 | b4) < 0) {
            throw new EOFException();
        }
        return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
    }

    /**
     * Loads the whole file into memory, closing the file handle even when reading fails.
     *
     * @param name path of the file to load
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    private static byte[] readFile(String name) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[64 * 1024];
        try (FileInputStream in = new FileInputStream(name)) {
            int len;
            while ((len = in.read(buffer, 0, buffer.length)) != -1) {
                bos.write(buffer, 0, len);
            }
        }
        return bos.toByteArray();
    }

    /**
     * Creates (or resizes) the test file to 100,000,000 bytes.
     *
     * @throws IOException if the file cannot be created or resized
     */
    private static void createFile() throws IOException {
        File file = new File(TEST_FILE);
        // Opening in "rw" mode creates the file when it does not exist yet.
        try (RandomAccessFile file1 = new RandomAccessFile(file, "rw")) {
            file1.setLength(100000000); //you can change size here
        }
    }

}

Here is the result:

a input byteStream sized 100000000 is created
use dataIuputStream to analyze byte stream:
  it takes 4489ms to loop 2 million times
use bufferedIuputStream to analyze byte stream:
  it takes 4686ms to loop 2 million times

So it seems that BufferedInputStream is slower? But when I changed the size of the input byte stream to 400 MB (by changing the test file size to 400 MB), the result became:

    a input byteStream sized 400000000 is created
use dataIuputStream to analyze byte stream:
  it takes 4740ms to loop 2 million times
use bufferedIuputStream to analyze byte stream:
  it takes 1384ms to loop 2 million times

So it seems that the performance of BufferedInputStream depends on the size of the input buffer. Either way, the time cost is too high.

  • Looks like increasing the file size extended the execution enough to allow JIT to finally trigger, and speed up the code. You need to warm up the JVM and cause all the code to be JIT-optimized, before you can compare performance benchmarks. See duplicate link up top. – Andreas Dec 28 '20 at 16:41
  • Thanks, it seems I didn't warm up my JVM. The time cost drops to 600 ms when the task is run several times in one execution. – monozuki62 Dec 29 '20 at 02:42

0 Answers0