0

This a program which presents how many times does each word occur within a text file. what is going on is that its also picking up characters like ? and , i only want it to pick letters. This is just part of the results {"1"=1, "Cheers"=1, "Fanny"=1, "I=1, "biscuits"=1, "chairz")=1, "cheeahz"=1, "crisps"=1, "jumpers"=1, ?=20, work:=1

import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.TreeMap;
import java.util.StringTokenizer;

public class Unigrammodel {

public static void main(String [] args){

    //Creating BufferedReader to accept the file name from the user
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));

    String fileName = null;
    System.out.print("Please enter the file name with path: ");
    try{
        fileName = (String) br.readLine();

        //Creating the BufferedReader to read the file
        File textFile = new File(fileName);
        BufferedReader input = new BufferedReader(new FileReader(textFile));

        //Creating the Map to store the words and their occurrences
        TreeMap<String, Integer> frequencyMap = new TreeMap<String, Integer>();
        String currentLine = null;

        //Reading line by line from the text file
        while((currentLine = input.readLine()) != null){

            //Parsing the words from each line
            StringTokenizer parser = new StringTokenizer(currentLine); 
            while(parser.hasMoreTokens()){
                String currentWord = parser.nextToken();




                //remove all non-alphanumeric from this word

            currentWord.replaceAll(("[^A-Za-z0-9 ]"), "");

                Integer frequency = frequencyMap.get(currentWord); 
                if(frequency == null){
                    frequency = 0;                      
                }
                //Putting each word and its occurrence into Map 
                frequencyMap.put(currentWord, frequency + 1);
            }

        }

        //Displaying the Result

        System.out.println(frequencyMap +"\n");

    }catch(IOException ie){
        ie.printStackTrace();
        System.err.println("Your entered path is wrong");
    }       

}

}

Muneeb Khan
  • 87
  • 1
  • 4
  • 17
  • Check [Regular Expression for alphanumeric and underscores](http://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores). – sam Oct 14 '15 at 17:04

1 Answers1

1

Strings are immutable, so you need to assign the modified string to a variable before adding it to the map. String wordCleaned= currentWord.replaceAll(("[^A-Za-z0-9 ]"), ""); ... frequencyMap.put(wordCleaned, frequency + 1);

hasnae
  • 2,137
  • 17
  • 21
  • I just ran it with a different text file but i think thers a problem the regex . Since its still showing brackets and stuff. – Muneeb Khan Oct 14 '15 at 17:22
  • can you post an example of line from the second file? – hasnae Oct 14 '15 at 17:30
  • =1, 1=1, 125102050=1, 14=1, 1800=1, 4=1, 456=1, 50=1, 5000=1, 6=1, 60=1, 6pm=1, AC=1, After=1, Again=1, All=1, Almost=1, Always=1, America=1, American=1, Americans=2, Avoid=1, BBC=2, Beer=1, – Muneeb Khan Oct 14 '15 at 17:53
  • see i don't want these numbers.it also starts with = since the text file does have = mentioned but i don't want it to pick it up. – Muneeb Khan Oct 14 '15 at 17:54
  • The display starts with an = because in your regular expression you have skipped also white space. – hasnae Oct 16 '15 at 14:01