import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.commons.io.FileUtils;
public class Indexer {
public static void main(String[] args) throws IOException {
// HindiStemmerLight shl = new HindiStemmerLight(); // stemmer disabled; uncomment if HindiStemmerLight is on the classpath
Scanner in1 = new Scanner(System.in);
System.out.println("");
System.out.println("Enter the File Path");
String path = in1.next();
File folder = new File(path);
File[] listOfFiles = folder.listFiles();
ArrayList<String> array = new ArrayList<String>();
ArrayList<String> stopwords = new ArrayList<String>();
File files = new File("/home/gaurav/stop-words_hindi_1_hi.txt");
String stopWordsFile = FileUtils.readFileToString(files);
String[] stopWords = stopWordsFile.split("\\s+"); // split on any whitespace so newline-separated lists also work
for(String str:stopWords){
stopwords.add(str);
}
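// NOTE: stop-word lookups below use ArrayList.contains(), which is a linear
// scan; a HashSet<String> would make each lookup O(1) instead of O(n)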
System.out.println("");
for (int i = 0; i < listOfFiles.length; i++) { // reading the contents of each file
    File file = listOfFiles[i];
    if (file.isFile() && file.getName().endsWith(".txt")) {
        String content = FileUtils.readFileToString(file); // whole file contents
        String[] a = content.split("\\s+"); // split on whitespace to tokenize into words
        for (String s : a) {
            s = s.trim();
            if (!stopwords.contains(s)) {
                // shl.stem(s); // applying the Hindi stemmer on each word
                // if (!array.contains(s)) // would keep only distinct words, but contains() is O(n)
                array.add(s);
            }
        }
    }
}
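// NOTE: `array` now holds every non-stop token of the entire corpus in memory,
// duplicates included; for 50,000 files this is a prime suspect for the
// "Java heap space" error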
Arrays.sort(listOfFiles, new Comparator<File>() { // sort files by name so DocIds come out in a stable order
    @Override
    public int compare(File f1, File f2) {
        return f1.getName().compareTo(f2.getName());
    }
});
Map<String, ArrayList<HashMap<String, Integer>>> words = new TreeMap<String, ArrayList<HashMap<String, Integer>>>();
Collections.sort(array);
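// NOTE: `array` still contains duplicates, so the nested loops below re-read
// the whole corpus from disk once per token occurrence, not once per distinct
// word; this is where most of the running time goes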
for (int i = 0; i < array.size(); i++) {
    String s = array.get(i);
    ArrayList<HashMap<String, Integer>> postings = new ArrayList<HashMap<String, Integer>>();
    for (File newFile : listOfFiles) {
        if (!newFile.isFile() || !newFile.getName().endsWith(".txt")) {
            continue; // skip directories and non-text files, matching the first pass
        }
        HashMap<String, Integer> doc = new HashMap<String, Integer>();
        int count = 0;
        String docId = newFile.getName();
        String c = FileUtils.readFileToString(newFile); // re-reads the file from disk for every word
        String[] w = c.split("\\s+");
        for (String s1 : w) {
            if (s.equals(s1)) {
                count++;
            }
        }
        if (count != 0) {
            doc.put(docId, count);
            postings.add(doc);
        }
    }
    words.put(s, postings);
}
PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
System.setOut(out);
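// from here on, everything printed with System.out goes to output.txt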
for (String name : words.keySet()) { // write each word and its postings list
    String value = words.get(name).toString();
    System.out.println(name + " " + value);
}
out.close();
}
}
I have built an indexer in Java. It works well when the corpus is small, but when the corpus grows to 50,000 text files it fails with an OutOfMemoryError (Java heap space) and takes a very long time to run. What changes should I make to reduce its time and memory complexity?
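For reference, below is a minimal single-pass sketch of the same indexer. The class name SinglePassIndexer and the use of Commons IO's LineIterator are my own choices; the stop-word path, whitespace tokenization, and output format mirror the code above. Each file is read exactly once, counts go straight into a word → document → count map, and stop words live in a HashSet so each lookup is O(1). This removes both the giant token list and the per-word re-reading of the corpus.

import java.io.File;
import java.io.IOException;
import java.util.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

public class SinglePassIndexer {
    public static void main(String[] args) throws IOException {
        Scanner in = new Scanner(System.in);
        System.out.println("Enter the File Path");
        File folder = new File(in.next());

        // HashSet: O(1) stop-word lookups instead of ArrayList's linear scan
        Set<String> stopwords = new HashSet<String>(Arrays.asList(
                FileUtils.readFileToString(
                        new File("/home/gaurav/stop-words_hindi_1_hi.txt")).split("\\s+")));

        // word -> (docId -> count); TreeMap keeps the words sorted for output
        Map<String, Map<String, Integer>> index = new TreeMap<String, Map<String, Integer>>();

        File[] files = folder.listFiles();
        Arrays.sort(files, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return f1.getName().compareTo(f2.getName());
            }
        });

        for (File file : files) {
            if (!file.isFile() || !file.getName().endsWith(".txt")) continue;
            String docId = file.getName();
            // stream the file line by line instead of loading it whole
            LineIterator it = FileUtils.lineIterator(file);
            try {
                while (it.hasNext()) {
                    for (String word : it.nextLine().split("\\s+")) {
                        word = word.trim();
                        if (word.isEmpty() || stopwords.contains(word)) continue;
                        Map<String, Integer> postings = index.get(word);
                        if (postings == null) {
                            postings = new LinkedHashMap<String, Integer>();
                            index.put(word, postings);
                        }
                        Integer count = postings.get(docId);
                        postings.put(docId, count == null ? 1 : count + 1);
                    }
                }
            } finally {
                LineIterator.closeQuietly(it);
            }
        }

        // one line per word: the word followed by its {docId=count, ...} map
        for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
            System.out.println(e.getKey() + " " + e.getValue());
        }
    }
}

With this structure the run time is roughly proportional to the total number of tokens in the corpus (times a log factor for the TreeMap) rather than words × files × file size, and the only large object on the heap is the index itself. If even the index is too big, the standard next step is to flush partial indexes to disk and merge them at the end (the SPIMI / external-merge approach used by real search engines), but once the duplicate token list is gone a 50,000-file corpus often fits in a normal heap.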