I'm trying to write some code that processes a JSON document with extremely long string values (longer than 1 billion characters) stored in a file. I don't want to keep whole strings in memory (since I can process them as a stream), but I can't find such an option in the Jackson parser. What I've done so far is this test, which uses Jackson token offsets (first pass over the file) and a random-access file to process the strings as a stream (second pass over the file):
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.MappingJsonFactory;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * Demonstrates streaming very long JSON string values from a file without holding a whole
 * value in memory.
 *
 * <p>The file is read twice with a Jackson parser. The first pass records, for every string
 * value, the byte offset at which it starts and the byte offset of the token that follows it.
 * The second pass uses a {@link RandomAccessFile} to copy the raw bytes of each string value
 * (including the surrounding double quotes) to an {@link OutputStream} in fixed-size chunks.
 *
 * <p>The bytes handed to the stream are the raw JSON text: escape sequences such as
 * {@code \n} or {@code \\uXXXX} are not decoded. Scanning for the quote and backslash bytes is
 * safe for UTF-8 input, because every byte of a multi-byte UTF-8 sequence is 0x80 or above.
 */
public class LongStringJsonTest {

    /** Size in bytes of the chunks read from the file while streaming one string value. */
    private static final int BUFFER_SIZE = 10000;

    /** Maximum number of bytes taken from a single write for the printed preview. */
    private static final int PREVIEW_LIMIT = 20;

    /**
     * Writes a sample JSON document containing a 1,000,000,000-character string value to
     * {@code temp.json} and then streams every string value found in it.
     *
     * @param args ignored
     * @throws Exception if the file cannot be written, parsed or read back
     */
    public static void main(String[] args) throws Exception {
        File tempJson = new File("temp.json");
        // Explicit charset: the file is decoded as UTF-8 when it is read back.
        try (PrintWriter pw = new PrintWriter(tempJson, StandardCharsets.UTF_8.name())) {
            pw.print("{\"k1\": {\"k11\": \"");
            for (int i = 0; i < 100_000_000; i++) {
                pw.print("abcdefghij");
            }
            pw.print("\"}, \"k2\": \"klmnopqrst\", "
                    + "\"k3\": [\"uvwxyz\", \"0123\"]}");
        }
        searchForStrings(tempJson);
    }

    /**
     * Streams every string value of the given JSON file and prints a short preview and the
     * byte length (quotes included) of each one.
     *
     * @param tempJson the JSON file to scan
     * @throws Exception if the file cannot be parsed or read
     * @throws IllegalStateException if the second pass meets a string value whose offsets
     *     were not recorded by the first pass
     */
    private static void searchForStrings(File tempJson) throws Exception {
        JsonFactory f = new MappingJsonFactory();
        Map<Long, Long> stringStartToNext = indexStringOffsets(f, tempJson);

        try (JsonParser jp = f.createParser(tempJson);
                RandomAccessFile raf = new RandomAccessFile(tempJson, "r")) {
            JsonToken token;
            while ((token = jp.nextToken()) != null) {
                if (token != JsonToken.VALUE_STRING) {
                    continue;
                }
                long start = jp.getTokenLocation().getByteOffset();
                Long end = stringStartToNext.get(start);
                if (end == null) {
                    throw new IllegalStateException(
                            "No end offset recorded for the string value at byte " + start);
                }
                // The sink sees the string in chunks, so the whole value is never in memory.
                // The streamed bytes include the quotes around the string.
                PreviewSink sink = new PreviewSink();
                processString(raf, start, end, sink);
                System.out.println(
                        "String: " + sink.previewText() + ", length=" + sink.length());
            }
        }
    }

    /**
     * First pass: maps the byte offset reported for each string value to the byte offset of
     * the token that follows it.
     *
     * <p>The stored end offset is negated when the string value directly follows a field
     * name; {@link #processString} uses the sign to decide how many quotes to skip. A string
     * that is the last token of the document gets the file length as its end offset.
     *
     * @param f factory used to create the parser
     * @param tempJson the JSON file to scan
     * @return map from string start offset to signed end offset
     * @throws IOException if the file cannot be parsed
     */
    private static Map<Long, Long> indexStringOffsets(JsonFactory f, File tempJson)
            throws IOException {
        Map<Long, Long> stringStartToNext = new HashMap<Long, Long>();
        long lastStringStart = -1;
        boolean wasFieldBeforeString = false;
        try (JsonParser jp = f.createParser(tempJson)) {
            JsonToken token;
            while ((token = jp.nextToken()) != null) {
                long tokenOffset = jp.getTokenLocation().getByteOffset();
                if (lastStringStart >= 0) {
                    // The current token closes the range of the previous string value.
                    stringStartToNext.put(
                            lastStringStart, wasFieldBeforeString ? -tokenOffset : tokenOffset);
                    lastStringStart = -1;
                    wasFieldBeforeString = false;
                }
                if (token == JsonToken.FIELD_NAME) {
                    wasFieldBeforeString = true;
                } else if (token == JsonToken.VALUE_STRING) {
                    lastStringStart = tokenOffset;
                } else {
                    wasFieldBeforeString = false;
                }
            }
        }
        if (lastStringStart >= 0) {
            // No token follows the last string (e.g. the document is a single string), so
            // its range ends at the end of the file.
            long fileLength = tempJson.length();
            stringStartToNext.put(
                    lastStringStart, wasFieldBeforeString ? -fileLength : fileLength);
        }
        return stringStartToNext;
    }

    /**
     * Copies the raw bytes of one JSON string value, from its opening quote through its
     * closing quote, to {@code os} in chunks of at most {@value #BUFFER_SIZE} bytes.
     *
     * <p>Assumes the opening quote (and, for a value after a field name, the whole field
     * name) lies within the first chunk and that the field name contains no double quotes.
     *
     * @param raf file to read from; its position is changed
     * @param start byte offset the parser reported for the string value
     * @param end byte offset of the following token; negative when the value directly
     *     follows a field name
     * @param os receives the raw bytes of the string, quotes included
     * @throws IOException if reading or writing fails, or if the opening quote is not found
     *     in the first chunk
     */
    private static void processString(RandomAccessFile raf, long start, long end,
            OutputStream os) throws Exception {
        // NOTE(review): skipping 3 quotes assumes the parser reports the offset of the field
        // name, not of the value, for a string that follows a field name. Confirm this
        // against the Jackson version in use.
        int quotesToSkip = end < 0 ? 3 : 1;
        // Kept as long: the range can exceed Integer.MAX_VALUE bytes.
        long remaining = Math.abs(end) - start;
        byte[] buffer = new byte[BUFFER_SIZE];
        raf.seek(start);
        boolean afterBackSlash = false;
        boolean firstChunk = true;
        while (remaining > 0) {
            int read = raf.read(buffer, 0, (int) Math.min(buffer.length, remaining));
            if (read < 0) {
                break;
            }
            if (read == 0) {
                continue;
            }
            // Count every byte read, including any skipped prefix of the first chunk.
            remaining -= read;

            int from = 0;
            int pos = 0;
            if (firstChunk) {
                from = findOpeningQuote(buffer, read, quotesToSkip);
                pos = from + 1; // Start the closing-quote search after the opening quote.
                firstChunk = false;
            }
            // Search for the closing quote; a quote after an unpaired backslash is escaped.
            for (; pos < read; pos++) {
                byte b = buffer[pos];
                if (b == '"' && !afterBackSlash) {
                    os.write(buffer, from, pos + 1 - from);
                    return;
                }
                afterBackSlash = b == '\\' && !afterBackSlash;
            }
            os.write(buffer, from, read - from);
        }
    }

    /**
     * Returns the index of the {@code quoteCount}-th unescaped double quote within the first
     * {@code length} bytes of {@code buffer}.
     *
     * @throws IOException if fewer than {@code quoteCount} unescaped quotes are present
     */
    private static int findOpeningQuote(byte[] buffer, int length, int quoteCount)
            throws IOException {
        boolean afterBackSlash = false;
        int seen = 0;
        for (int pos = 0; pos < length; pos++) {
            byte b = buffer[pos];
            if (b == '"' && !afterBackSlash) {
                seen++;
                if (seen == quoteCount) {
                    return pos;
                }
            } else {
                afterBackSlash = b == '\\' && !afterBackSlash;
            }
        }
        throw new IOException("Opening quote of the string value was not found in the first "
                + length + " bytes (expected " + quoteCount + " quotes, found " + seen + ")");
    }

    /**
     * Output stream that counts all bytes written to it and keeps only a short preview.
     *
     * <p>While the preview holds fewer than {@value #PREVIEW_LIMIT} bytes, each array write
     * adds up to {@value #PREVIEW_LIMIT} bytes followed by {@code "..."}. The preview is cut
     * on a byte boundary, so it may end in the middle of a multi-byte character.
     */
    private static final class PreviewSink extends OutputStream {
        private final ByteArrayOutputStream preview = new ByteArrayOutputStream();
        private long length;

        @Override
        public void write(int b) throws IOException {
            throw new IOException("Method is not supported");
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            if (preview.size() < PREVIEW_LIMIT) {
                preview.write(b, off, Math.min(len, PREVIEW_LIMIT));
                preview.write('.');
                preview.write('.');
                preview.write('.');
            }
            if (len > 0) {
                length += len;
            }
        }

        /** Returns the preview bytes decoded as UTF-8. */
        String previewText() {
            return new String(preview.toByteArray(), StandardCharsets.UTF_8);
        }

        /** Returns the total number of bytes written to this stream. */
        long length() {
            return length;
        }
    }
}
This approach doesn't support Unicode at all. I'm curious whether there is any way to do it better (perhaps with the help of some other libraries)?