4

I got the program below from a coding site.

The following code reads a text file and finds duplicate words.

I want to read each text file and display its duplicate-word counts line by line. I also want to know how to read the files when their content is not already stored as a String; I used a BufferedReader, but I am not getting my output.

My questions:

  1. How can I make the program read multiple files from a given folder?

  2. How to save the results in Excel file format?

Any suggestions are welcome.

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;


/**
 * Counts how often each word occurs in a text file and lists the words from
 * the most frequent to the least frequent.
 */
public class MaxDuplicateWordCount {

    /** File analysed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT_FILE = "E:\\Blog 39.txt";

    /**
     * Reads a text file and counts the occurrences of every word in it.
     *
     * <p>The file is decoded as UTF-8 (a leading byte-order mark is ignored).
     * Words are the tokens separated by any whitespace and are lower-cased
     * before counting, so {@code "Word"} and {@code "word"} share one entry.
     *
     * @param fileName path of the file to read
     * @return a map from lower-cased word to its number of occurrences; if the
     *         file cannot be opened or read, the stack trace is printed and the
     *         counts collected so far (possibly none) are returned
     */
    public Map<String, Integer> getWordCount(String fileName){

        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader and the stream on every path;
        // a failing close() is reported by the IOException handler below.
        try (FileInputStream fis = new FileInputStream(fileName);
             BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"))) {
            String line;
            boolean firstLine = true;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Some editors start UTF-8 files with a byte-order mark (U+FEFF);
                    // it is not whitespace, so it would otherwise stick to the first word.
                    if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                // The default delimiters split on tabs and other whitespace, not only on ' '.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    String word = st.nextToken().toLowerCase();
                    Integer seen = wordMap.get(word);
                    wordMap.put(word, seen == null ? 1 : seen + 1);
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return wordMap;
    }

    /**
     * Orders the entries of a word-count map by descending count.
     *
     * <p>Words with the same count are ordered alphabetically so that the
     * result does not depend on the iteration order of the map.
     *
     * @param wordMap word counts, e.g. as returned by {@link #getWordCount(String)}
     * @return a new list holding the map's entries, highest count first
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap){

        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Prints the word counts of one file, most frequent word first.
     *
     * @param a optional command-line arguments; {@code a[0]} is the file to
     *          analyse, otherwise the built-in default path is used
     */
    public static void main(String a[]){

        String fileName = (a != null && a.length > 0) ? a[0] : DEFAULT_INPUT_FILE;

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        Map<String, Integer> wordMap = mdc.getWordCount(fileName);

        List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
        for(Map.Entry<String, Integer> entry:list){
            System.out.println(entry.getKey()+" ="+entry.getValue());
        }
    }
}
Maytham Fahmi
  • 31,138
  • 14
  • 118
  • 137
Ram Ki
  • 282
  • 3
  • 16

2 Answers2

2

Intro

After chatting with the OP, here is a brief summary of what the OP requires:

1- Read one or more files from a specific folder; the files are typically Unicode text files.
2- The files are processed by the OP's algorithm from the question, and the results should be saved to a Unicode file again (the OP later asked for them to be saved as an Excel file (.XLS) because of Unicode compatibility with Excel).

Solution

This can be solved in following steps:

step 1 We define (declare) our workspace.
step 2 We create the output folder in the workspace if it does not exist.
step 3 We read all existing files in the workspace folder and process them with the algorithm.
step 4 The results for each file are saved as an Excel file in the output folder.

The code

First of all, you need to import the POI package, which allows you to create XLS sheets. I downloaded poi/poi-3.5-FINAL.jar.zip (1,372 k); the following imports should be added to your code.

import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

Next, add the following code to your class; it is self-explanatory:

/** Folder that holds the input files; it must end with a separator because paths are built by concatenation. */
final static String WORKSPACE = "C:/testfolder/";

/**
 * Makes sure the output folder exists below {@link #WORKSPACE}.
 *
 * <p>This is best effort: a failure is reported on standard error and the
 * method returns normally, so the caller is never interrupted by it.
 *
 * @param outputFolderName folder name relative to {@link #WORKSPACE}
 */
private static void createOutputFolder(String outputFolderName) {
    File outputDirectory = new File(WORKSPACE + outputFolderName);

    try {
        // mkdirs() also creates a missing workspace folder and signals failure
        // through its return value, not an exception, so the result is checked.
        if (!outputDirectory.isDirectory() && !outputDirectory.mkdirs()) {
            System.err.println("Could not create output folder " + outputDirectory);
        }
    } catch (SecurityException e) {
        System.err.println("Not allowed to create output folder " + outputDirectory + ": " + e);
    }
}

/**
 * Runs the word count on every regular file directly inside {@link #WORKSPACE}
 * and writes one Excel workbook per input file into the {@code output/}
 * sub-folder.
 *
 * <p>Each workbook contains a single sheet named "ResultSheet" with a header
 * row followed by one row per word (serial number, word, count), in the order
 * returned by {@code sortByValue}. A failure on one file is printed and the
 * remaining files are still processed.
 */
private static void exlCreator() {

    // Highest zero-based row index an .xls sheet can hold (65,536 rows in total).
    final int lastXlsRowIndex = 65535;

    String outputFolder = "output/";
    createOutputFolder(outputFolder);

    MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
    File folder = new File(WORKSPACE);
    File[] listOfFiles = folder.listFiles();
    if (listOfFiles == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        System.err.println(WORKSPACE + " is not a readable folder");
        return;
    }

    for (File inputFile : listOfFiles) {
        if (!inputFile.isFile()) {
            continue;
        }
        String fileName = inputFile.getName();
        String fileNameWPathInput = WORKSPACE + fileName;
        Map<String, Integer> wordMap = mdc.getWordCount(fileNameWPathInput);
        List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);

        // Drop the extension whatever its length; names without one are kept whole.
        int dot = fileName.lastIndexOf('.');
        String baseName = dot > 0 ? fileName.substring(0, dot) : fileName;
        String fileNameWPathOutput = WORKSPACE + outputFolder + baseName + "output.xls";
        try {
            HSSFWorkbook workbook = new HSSFWorkbook();
            HSSFSheet sheet = workbook.createSheet("ResultSheet");

            HSSFRow rowhead = sheet.createRow(0);
            rowhead.createCell(0).setCellValue("Serial No.");
            rowhead.createCell(1).setCellValue("Word");
            rowhead.createCell(2).setCellValue("Count");

            // Declared per file so a failed write can never leak a stale
            // counter into the next workbook.
            int serialNumber = 1;
            for (Map.Entry<String, Integer> entry : list) {
                if (serialNumber > lastXlsRowIndex) {
                    System.err.println(fileNameWPathOutput + ": word list truncated to "
                            + lastXlsRowIndex + " rows (.xls sheet limit)");
                    break;
                }
                // No (short) cast: it would wrap to a negative index after row 32767.
                HSSFRow row = sheet.createRow(serialNumber);
                row.createCell(0).setCellValue(serialNumber);
                row.createCell(1).setCellValue(entry.getKey());
                row.createCell(2).setCellValue(entry.getValue());
                serialNumber++;
            }
            // try-with-resources closes the stream even when write() fails.
            try (FileOutputStream fileOut = new FileOutputStream(fileNameWPathOutput)) {
                workbook.write(fileOut);
            }
            System.out.println(fileNameWPathOutput + " is created");

        } catch (Exception ex) {
            // Per-file boundary: report and continue with the next input file.
            System.out.println(ex);
        }
    }
}

/**
 * Program entry point; all of the work is delegated to {@code exlCreator()}.
 *
 * @param args command-line arguments (not used)
 * @throws IOException part of the declared signature
 */
public static void main(String[] args) throws IOException {
    exlCreator();
}

Finally

By modifying the code, it is possible to create a single output file and write each file's results to a separate worksheet. As you can see in the image below, the output file opened in Excel shows the Unicode text without problems, which was the issue with my first solution: enter image description here

Links

Download POI
POI documentation
Unicode problem in CSV
More about CSV

Full code, requested from OP

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
// for the Excel sheet (Apache POI)
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

/**
 * Counts word occurrences in every file of a workspace folder and writes the
 * result for each file to its own Excel (.xls) workbook.
 */
public class MaxDuplicateWordCount {

    /** Folder that holds the input files; it must end with a separator because paths are built by concatenation. */
    final static String WORKSPACE = "C:/testfolder/";

    /** Highest zero-based row index an .xls sheet can hold (65,536 rows in total). */
    private static final int LAST_XLS_ROW_INDEX = 65535;

    /**
     * Reads a text file and counts the occurrences of every word in it.
     *
     * <p>The file is decoded as UTF-8 (a leading byte-order mark is ignored).
     * Words are the tokens separated by any whitespace and are lower-cased
     * before counting, so {@code "Word"} and {@code "word"} share one entry.
     *
     * @param fileName path of the file to read
     * @return a map from lower-cased word to its number of occurrences; if the
     *         file cannot be opened or read, the stack trace is printed and the
     *         counts collected so far (possibly none) are returned
     */
    public Map<String, Integer> getWordCount(String fileName) {

        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader and the stream on every path;
        // a failing close() is reported by the IOException handler below.
        try (FileInputStream fis = new FileInputStream(fileName);
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(fis, StandardCharsets.UTF_8))) {
            String line;
            boolean firstLine = true;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Some editors start UTF-8 files with a byte-order mark (U+FEFF);
                    // it is not whitespace, so it would otherwise stick to the first word.
                    if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                // The default delimiters split on tabs and other whitespace, not only on ' '.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    String word = st.nextToken().toLowerCase();
                    Integer seen = wordMap.get(word);
                    wordMap.put(word, seen == null ? 1 : seen + 1);
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return wordMap;
    }

    /**
     * Orders the entries of a word-count map by descending count.
     *
     * <p>Words with the same count are ordered alphabetically so that the
     * result does not depend on the iteration order of the map.
     *
     * @param wordMap word counts, e.g. as returned by {@link #getWordCount(String)}
     * @return a new list holding the map's entries, highest count first
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap) {

        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Makes sure the output folder exists below {@link #WORKSPACE}.
     *
     * <p>This is best effort: a failure is reported on standard error and the
     * method returns normally.
     *
     * @param outputFolderName folder name relative to {@link #WORKSPACE}
     */
    private static void createOutputFolder(String outputFolderName) {
        File outputDirectory = new File(WORKSPACE + outputFolderName);

        try {
            // mkdirs() also creates a missing workspace folder and signals failure
            // through its return value, not an exception, so the result is checked.
            if (!outputDirectory.isDirectory() && !outputDirectory.mkdirs()) {
                System.err.println("Could not create output folder " + outputDirectory);
            }
        } catch (SecurityException e) {
            System.err.println("Not allowed to create output folder " + outputDirectory + ": " + e);
        }
    }

    /**
     * Runs the word count on every regular file directly inside
     * {@link #WORKSPACE} and writes one workbook per input file into the
     * {@code output/} sub-folder. A failure on one file is printed and the
     * remaining files are still processed.
     */
    private static void exlCreator() {

        String outputFolder = "output/";
        createOutputFolder(outputFolder);

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        File[] listOfFiles = new File(WORKSPACE).listFiles();
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println(WORKSPACE + " is not a readable folder");
            return;
        }

        for (File inputFile : listOfFiles) {
            if (!inputFile.isFile()) {
                continue;
            }
            String fileName = inputFile.getName();
            Map<String, Integer> wordMap = mdc.getWordCount(WORKSPACE + fileName);
            List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
            String fileNameWPathOutput =
                    WORKSPACE + outputFolder + stripExtension(fileName) + "output.xls";
            try {
                writeWorkbook(list, fileNameWPathOutput);
                System.out.println(fileNameWPathOutput + " is created");
            } catch (IOException | RuntimeException ex) {
                // Per-file boundary: report and continue with the next input file.
                System.out.println(ex);
            }
        }
    }

    /** Returns the file name without its last extension; names without one are returned unchanged. */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /**
     * Writes the word list to a new workbook with a single sheet "ResultSheet":
     * a header row followed by one row per word (serial number, word, count).
     *
     * @param list       entries to write, in output order
     * @param outputPath path of the .xls file to create
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<Entry<String, Integer>> list, String outputPath)
            throws IOException {
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("ResultSheet");

        HSSFRow rowhead = sheet.createRow(0);
        rowhead.createCell(0).setCellValue("Serial No.");
        rowhead.createCell(1).setCellValue("Word");
        rowhead.createCell(2).setCellValue("Count");

        int serialNumber = 1;
        for (Map.Entry<String, Integer> entry : list) {
            if (serialNumber > LAST_XLS_ROW_INDEX) {
                System.err.println(outputPath + ": word list truncated to "
                        + LAST_XLS_ROW_INDEX + " rows (.xls sheet limit)");
                break;
            }
            // No (short) cast: it would wrap to a negative index after row 32767.
            HSSFRow row = sheet.createRow(serialNumber);
            row.createCell(0).setCellValue(serialNumber);
            row.createCell(1).setCellValue(entry.getKey());
            row.createCell(2).setCellValue(entry.getValue());
            serialNumber++;
        }

        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream fileOut = new FileOutputStream(outputPath)) {
            workbook.write(fileOut);
        }
    }

    /**
     * Program entry point; all of the work is delegated to {@link #exlCreator()}.
     *
     * @param args command-line arguments (not used)
     * @throws IOException part of the declared signature
     */
    public static void main(String[] args) throws IOException {
        exlCreator();
    }
}
Community
  • 1
  • 1
Maytham Fahmi
  • 31,138
  • 14
  • 118
  • 137
  • Exception in thread "main" java.lang.Error: Unresolved compilation problems: Syntax error on token ",", TypeArgument1 expected after this token Syntax error on token "=", <= expected Syntax error on token(s), misplaced construct(s) Syntax error, insert ">>" to complete ReferenceType2 Syntax error, insert "( )" to complete Expression Syntax error, insert ")" to complete MethodInvocation Syntax error, insert ";" to complete Statement Syntax error, insert "}" to complete MethodBody at ramki.maxoccurrence.main(maxoccurrence.java:38) I got these errors :/ – Ram Ki Feb 01 '16 at 09:07
  • Yeah I have changed my compilation to JAVA8 too – Ram Ki Feb 01 '16 at 09:09
  • @RamKi please find my updated answer that generates excel ark in stead of csv file. – Maytham Fahmi Feb 08 '16 at 08:42
  • Have you imported POI? – Maytham Fahmi Feb 08 '16 at 09:37
  • Yeah got that exactly :) – Ram Ki Feb 08 '16 at 09:46
  • Yes working but still looking to integrate it! you may see this query with my gui interface "http://stackoverflow.com/questions/35266728/how-to-add-check-box-in-textarea-using-swing-and-destination-output-folderjfile" – Ram Ki Feb 08 '16 at 10:00
  • I will look at it later this week. – Maytham Fahmi Feb 08 '16 at 10:07
1

Let's say you have a directory containing all the files you want to read.

File folder = new File("/Users/you/folder/");
File[] listOfFiles = folder.listFiles();

// listFiles() returns null when the path is not a directory or cannot be
// read, so guard against it instead of failing with a NullPointerException.
if (listOfFiles != null) {
    for (File file : listOfFiles) {

        if (file.isFile()) {
            /*
             * The content does not have to be available as a String up front
             * (the question asks: "And how to call that files if it is not
             * stored as String"): read the raw bytes and decode them.
             */
            byte[] bytes = Files.readAllBytes(file.toPath());
            String decoded = new String(bytes, "UTF-8");
            String[] words = decoded.split("\\s+");
            for (int i = 0; i < words.length; i++) {
                /*  Remove everything that is not a letter, combining mark,
                 *  digit or underscore. Unicode categories are used because
                 *  \w only matches ASCII by default and would erase words
                 *  written in any other script.
                 *  A token made only of punctuation becomes an empty string.
                 */
                words[i] = words[i].replaceAll("[^\\p{L}\\p{M}\\p{N}_]", "");
                //Here are all the words from a file. You can do whatever you want with them
            }
        }

    }
}
Lazar Lazarov
  • 2,412
  • 4
  • 26
  • 35
  • List list = new ArrayList(Arrays.asList("cat", "cat", "dog", "horse", "monkey", "zebra", "zebra", "dog", "dog", "dog", "fleas")); List list2 = new ArrayList(); Instead of this string could i load from directory,My code is working for the given string. can you make the code apply to this – Ram Ki Jan 20 '16 at 09:24