-1

Hey everyone, I am very new to Python and to coding in general. I have an assignment where I need to read a text file, count the words, rank them and plot the result as a graph. I've managed to do everything except the plotting. Here is my code, followed by an example of how the counts that need to be plotted are stored.

import nltk
import nltk.tokenize 
import collections
import numpy as np

from nltk.tokenize import word_tokenize

# Read the entire corpus into memory as one string; the `with` block closes
# the file as soon as the read is done.
with open("en.txt") as file:
    data = file.read()

# Split the raw text into tokens with NLTK's word_tokenize.
word_tokenize_list = word_tokenize(data)

from collections import Counter
# Tally how many times each token occurs, then print the resulting Counter.
counts = Counter(word_tokenize_list)
print(counts)

Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883})

My list is very big, by the way. All I am asking for is a hint about what can be used, because plt.plot does not work for me in this case.

RevanthKrishnaKumar V.
  • 1,855
  • 1
  • 21
  • 34
Ryne Ignelzy
  • 137
  • 1
  • 2
  • 13

3 Answers3

0

The plot most useful for this is probably a bar chart, which can be plotted from a dictionary directly using this answer,

import matplotlib.pyplot as plt

# Token -> number of occurrences.  Deliberately not called ``Counter`` so the
# collections.Counter class is not shadowed by a plain dict.
word_counts = {
    ',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869,
    'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990,
    'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695,
    "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177,
    'by': 5035, 'from': 4939, 'million': 4883,
}

# One bar per token, placed at x = 0, 1, 2, ...
positions = range(len(word_counts))

# On Python 3, dict.values() / dict.keys() return views rather than lists;
# materialising them keeps the call independent of how a given Matplotlib
# release converts non-sequence iterables.
plt.bar(positions, list(word_counts.values()), align='center')

# Label each bar with its token, rotated by 90 degrees so the labels do not
# overlap.
plt.xticks(positions, list(word_counts.keys()), rotation=90)

plt.show()

which looks like,

enter image description here

Community
  • 1
  • 1
Ed Smith
  • 12,716
  • 2
  • 43
  • 55
0

Matplotlib is a very widely used graphing library which can be used with Python.

You will probably want to sort your counter data first, based on some ranking criterion. Below are two possible solutions:

from collections import Counter
import matplotlib.pyplot as plt

data = Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883})
xaxis = range(len(data))

# Ranking 1: by frequency.  most_common() is ordered from most to least
# frequent, so the reversed slice puts the rarest token first.
ascending_by_freq = data.most_common()[::-1]
keys_freq = [word for word, _ in ascending_by_freq]
values_freq = [count for _, count in ascending_by_freq]

# Ranking 2: by token length, with ties ordered by the token string itself.
keys_length = sorted(data.keys(), key=lambda x: (len(x), x))
values_length = [data[word] for word in keys_length]

fig = plt.figure()

# Both panels are drawn the same way: the top one (211) shows the frequency
# ranking, the bottom one (212) the length ranking.
panels = (
    (211, keys_freq, values_freq),
    (212, keys_length, values_length),
)
for position, words, heights in panels:
    plt.subplot(position)
    plt.bar(xaxis, heights, align='center')
    plt.xticks(xaxis, words)
    # Fetch the tick labels just set and turn them vertical for legibility.
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=90)

fig.tight_layout()
plt.show()

Giving you:

Matplotlib screenshot

Martin Evans
  • 45,791
  • 17
  • 81
  • 97
0

In the end, here is what I did with the help of friends:

#Importing all the necessary libraries
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import string

#Opening/reading/editing file

# Single characters deleted outright before the text is split into words.
# The apostrophe and the single hyphen are deliberately absent so that
# contractions ("don't") and hyphenated words survive intact.
PUNCTUATION = frozenset('!#"%$&)(+*,/.;:=<?>@[]\\_^`{}|~')

# Multi-character marks: paired quote marks and the '--' dash.  A
# per-character filter can never match them, so they are replaced up front.
MULTI_CHAR_MARKS = ("''", '--')


def ask(prompt):
    """Show *prompt* on the console and return the line the user types.

    Uses ``raw_input`` where it exists (Python 2) and falls back to
    ``input`` otherwise (Python 3), so the script runs on both.
    """
    try:
        reader = raw_input
    except NameError:
        reader = input
    return reader(prompt)


def split_words(text):
    """Return the list of words in *text* with punctuation removed.

    Multi-character marks are replaced by a space so that the words on either
    side of a dash stay separate; single punctuation characters are deleted.
    """
    for mark in MULTI_CHAR_MARKS:
        text = text.replace(mark, ' ')
    return ''.join(ch for ch in text if ch not in PUNCTUATION).split()


def strip_whitespace(text):
    """Return *text* with every character of ``string.whitespace`` removed."""
    return ''.join(ch for ch in text if ch not in string.whitespace)


def zipf_fit(ranked):
    """Compute the rank/frequency arrays and the Zipf constant for *ranked*.

    *ranked* is a list of ``(item, frequency)`` pairs ordered from most to
    least frequent, as produced by ``Counter.most_common()``.

    Returns ``(ranks, freqs, K, Ks)`` where ``ranks`` starts at 1 (rank is
    1-based), ``K`` is the mean of ``rank * frequency`` and ``Ks`` its
    standard deviation, both rounded to two decimals.
    """
    ranks = np.arange(1, len(ranked) + 1)
    freqs = np.array([freq for _, freq in ranked])
    products = ranks * freqs
    return ranks, freqs, round(np.average(products), 2), round(np.std(products), 2)


def plot_zipf(ranked, title, unit):
    """Plot measured frequency against rank together with the curve y = K/x.

    *ranked* is the ``most_common()`` list, *title* the figure title and
    *unit* the noun ("word" or "character") used in the legend.  Nothing is
    drawn when *ranked* is empty.
    """
    if not ranked:
        print('Nothing to plot: the text contains no %ss.' % unit)
        return
    x, y, K, Ks = zipf_fit(ranked)
    plt.plot(x, y, color='red', linewidth=3)
    plt.plot(x, K / x, color='green', linewidth=2)
    plt.xlabel('Rank')
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    # Invisible point: exists only so the third legend entry has an artist.
    plt.plot(0, 0, 'o', alpha=0)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.title(title)
    plt.legend(
        [
            'Experimental data',
            # Raw string: '\d' is not a valid escape in a normal literal.
            r'y=K/x, K=%s, $\delta_{K}$ = %s' % (K, Ks),
            'Most used %s=%s, least used=%s' % (unit, ranked[0], ranked[-1]),
        ],
        loc='best',
        numpoints=1,
    )
    plt.show()


def main():
    """Ask for a file and a counting mode, then plot its Zipf distribution."""
    filename = ask('Filename (e.g. yourfile.txt): ')
    cond = ask('What do you want to count? \n A) Words.\n B) Characters and     Punctuation. \n Choice: ')
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as handle:
        # Lower-case everything so sentence-initial capitals do not create
        # separate entries for the same word.
        text = handle.read().lower()

    if cond in ['A', 'a', 'A)', 'a)']:
        ranked = Counter(split_words(text)).most_common()
        plot_zipf(
            ranked,
            "Testing Zipf's Law: the relationship between the frequency and rank of a word in a text",
            'word',
        )
    elif cond in ['B', 'b', 'B)', 'b)']:
        ranked = Counter(strip_whitespace(text)).most_common()
        plot_zipf(
            ranked,
            "Testing Zipf's Law: the relationship between the frequency and rank of a character/punctuation,  in a text",
            'character',
        )
    else:
        print('Unrecognised choice: %r (expected A or B).' % cond)


if __name__ == '__main__':
    main()
Ryne Ignelzy
  • 137
  • 1
  • 2
  • 13