import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.commons.io.FileUtils;

public class Indexer {

    public static void main(String[] args) throws IOException {

        HindiStemmerLight shl = new HindiStemmerLight();            // light Hindi stemmer (external class)

        Scanner in1 = new Scanner(System.in);
        System.out.println();
        System.out.println("Enter the file path");

        String path = in1.next();


        File folder = new File(path);
        File[] listOfFiles = folder.listFiles();
        ArrayList<String> array = new ArrayList<String>();

        ArrayList<String> stopwords = new ArrayList<String>();
        File files = new File("/home/gaurav/stop-words_hindi_1_hi.txt");
        String stopWordsFile = FileUtils.readFileToString(files);
        String[] stopWords = stopWordsFile.split("\\s+");           // split on any whitespace, in case the list is line-separated
        for (String str : stopWords) {
            stopwords.add(str);
        }
        System.out.println();

        for (int i = 0; i < listOfFiles.length; i++) {              // read the contents of each file

            File file = listOfFiles[i];

            if (file.isFile() && file.getName().endsWith(".txt")) {
                String content = FileUtils.readFileToString(file);  // whole file contents as one String

                String[] a = content.split("\\s+");                 // split on whitespace; split("") would split into single characters
                for (String s : a) {
                    s = s.trim();
                    if (!stopwords.contains(s)) {
                        //shl.stem(s);                              // optionally apply the Hindi stemmer to each word
                        //if (!array.contains(s))                   // optionally store each word only once
                        array.add(s);
                    }
                }
            }
        }

        Arrays.sort(listOfFiles, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return f1.getName().compareTo(f2.getName());
            }
        });


        Map<String, ArrayList<HashMap<String, Integer>>> words = new TreeMap<String, ArrayList<HashMap<String, Integer>>>();
        Collections.sort(array);
        for (int i = 0; i < array.size(); i++) {
            String s = array.get(i);
            ArrayList<HashMap<String, Integer>> postings = new ArrayList<HashMap<String, Integer>>();

            for (File newFile : listOfFiles) {
                HashMap<String, Integer> doc = new HashMap<String, Integer>();
                int count = 0;
                String docId = newFile.getName();
                String c = FileUtils.readFileToString(newFile);     // every file is re-read once per word: the quadratic hot spot
                String[] w = c.split(" ");
                for (String s1 : w) {
                    if (s.equals(s1)) {
                        count++;
                    }
                }
                if (count != 0) {
                    doc.put(docId, count);
                    postings.add(doc);
                }
            }
            words.put(s, postings);
        }
        PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
        System.setOut(out);
        for (String name : words.keySet()) {
            String value = words.get(name).toString();
            System.out.println(name + " " + value);
        }
    }
}

I have made an indexer in Java. It works well when the document corpus is small, but when the corpus is around 50,000 text files it throws an "Out of memory: Java heap space" error and takes a huge amount of time to run. Please suggest what changes need to be made to reduce its complexity.

2 Answers


Index in smaller batches; don't keep the whole dataset in memory.
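As a minimal sketch of that idea (assuming Java 8+; the BATCH_SIZE value and the partial-N.txt file naming are hypothetical, chosen just for illustration), you can flush the in-memory index to disk after every batch and clear it, so the heap never holds more than one batch at a time:

    import java.io.File;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.util.Map;
    import java.util.Scanner;
    import java.util.TreeMap;

    public class BatchIndexer {

        static final int BATCH_SIZE = 1000;                        // hypothetical; tune to the available heap

        public static void main(String[] args) throws IOException {
            File[] files = new File(args[0]).listFiles();
            Map<String, Map<String, Integer>> index = new TreeMap<>();
            int batch = 0;

            for (int i = 0; i < files.length; i++) {
                try (Scanner sc = new Scanner(files[i])) {
                    while (sc.hasNext()) {                         // stream one word at a time
                        index.computeIfAbsent(sc.next(), k -> new TreeMap<>())
                             .merge(files[i].getName(), 1, Integer::sum);
                    }
                }
                if ((i + 1) % BATCH_SIZE == 0 || i == files.length - 1) {
                    flush(index, "partial-" + batch++ + ".txt");   // spill the finished batch to disk
                    index.clear();                                 // free the heap before the next batch
                }
            }
        }

        static void flush(Map<String, Map<String, Integer>> index, String name) throws IOException {
            try (PrintWriter out = new PrintWriter(name)) {
                for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
                    out.println(e.getKey() + " " + e.getValue());
                }
            }
        }
    }

Because each partial file is written in sorted term order (TreeMap), the partial files can afterwards be combined with a simple k-way merge instead of being loaded back into memory all at once.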

Iouri Goussev

There is no reason to read entire files into memory. Scan them a word at a time. And there is certainly no reason to read them twice.
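For instance (a sketch only, assuming Java 8+ and reusing the listOfFiles and stopwords variables from the question), the two passes in the question can be collapsed into one streaming pass with a Scanner, which reads a word at a time instead of calling FileUtils.readFileToString:

    // One pass replaces both loops: each file is opened exactly once and
    // scanned word by word, so no whole file is ever held in memory.
    Map<String, Map<String, Integer>> words = new TreeMap<>();
    for (File file : listOfFiles) {
        if (!file.isFile() || !file.getName().endsWith(".txt")) continue;
        try (Scanner sc = new Scanner(file)) {
            while (sc.hasNext()) {
                String w = sc.next();
                if (stopwords.contains(w)) continue;
                words.computeIfAbsent(w, k -> new TreeMap<>())
                     .merge(file.getName(), 1, Integer::sum);      // per-document term frequency
            }
        }
    }
    // TreeMap keeps the terms (and each term's document lists) sorted, so the
    // explicit Collections.sort and Arrays.sort passes are no longer needed.

Replacing the ArrayList of stopwords with a HashSet<String> would also turn each contains check from a linear scan into a constant-time lookup.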

user207421