2

I'm new to python. So I appreciate your help to help me to see what are some issue with my current implementation of the histogram. My goal is to implementing a histogram to calculate the frequencies and probabilities of the words happen given an input.

I used a dictionary data structure in my program to first solve this problem. Then I tried to rewrite it using only the list data structures. It does not seem to make that much difference between the two approaches. I see that there are different approaches to solve this problem. My 2nd question is about using the appropriate data structure to solve this problem. Is there any tradeoff with different sampling techniques?

import sys
import re
import random
import time

# Analyze word frequency in a histogram
# sample words according to their observed frequencies
# compare tradeoff with different sampling techniques
#   1. # use dictionary to create my histogram key and value pairs inside my histogram
# 2. use list of list to create one solution to solve this problem
# input: source_text has a string of  
# input source "one fish two fish red fish blue fish"
def histogram(source_text):
    histogram = {}
    # python ways of removing any sort of string, removing ,:;, and any other special character
    for word in source_text.split():
        word = re.sub('[.,:;!-[]?', '', word)

        if word in histogram:
            histogram[word] += 1
        else:
            histogram[word] = 1
    return histogram

def random_word(histogram):
    probability = 0
    rand_index = random.randint(1, sum(histogram.values()))
    # Algorithm 1
    # for (key, value) in histogram.items():
    #     for num in range(1, value + 1):
    #         if probability == rand_index:
    #             if key in outcome_gram:
    #                 outcome_gram[key] += 1
    #             else:
    #                 outcome_gram[key] = 1
    #             # return outcome_gram
    #             return key
    #         else:
    #             probability += 1
    # Algorithm 2
    for word in histogram:
        probability += histogram[word]
        if probability >= rand_index:
            if word in outcome_gram:
                outcome_gram[word] += 1
            else:
                outcome_gram[word] = 1
            return word

def unique_words(histogram):
    """Return the amount of words that only show up once in the histogram."""
    #  uniqueWords is a list that stores the words that only happens once
    uniqueWords = []
    for histogramWord in histogram:
        if histogramWord[1] == 1:
            uniqueWords.append(histogramWord)
    return(len(uniqueWords))

def frequency(word, histogram):
    """Return the amount of times a word shows up in the histogram."""
    for histogramWord in histogram:
        if histogramWord[0] == word:
            return histogramWord[1]

if __name__ == "__main__":
    outcome_gram = {}
    dict = open('./fish.txt', 'r')
    text = dict.read()
    dict.close()
    hist_dict = histogram(text)
    for number in range(1, 100000):
        random_word(hist_dict)

    print("If this were a perfect algorithm, the number of fish would be 50000, but my actual value is " + str(outcome_gram["fish"]))
    # for word, expected_count in hist_dict.items():
    print("The percent error is " + str(abs(outcome_gram["fish"] - 50000.0) / 50000.0 * 100.0) + "%")
    # outcome_gram["fish"] = abs(outcome_gram["fish"] - 5000.0) / 5000.0 * 100.0
    # calculate the frequencies of the words
    print (outcome_gram)

If this were a perfect algorithm, the number of fish would be 50000, #but my actual value is 49991

The percent error is 0.018%

output: {'blue': 12457, 'fish': 49991, 'two': 12454, 'red': 12631, 'one': 12466}

#here's some sample test. $ python sample.py one fish two fish red fish blue fish
#two

#$ python sample.py one fish two fish red fish blue fish
#fish

#$ python sample.py one fish two fish red fish blue fish
#blue

Thank you.

DataEngineer
  • 396
  • 1
  • 10
  • You can use `Counter` from the `collections` module. See [this](https://stackoverflow.com/questions/20510768/count-frequency-of-words-in-a-list-and-sort-by-frequency) question. – Roald Feb 27 '18 at 14:21
  • Possible duplicate of [Count frequency of words in a list and sort by frequency](https://stackoverflow.com/questions/20510768/count-frequency-of-words-in-a-list-and-sort-by-frequency) – Roald Feb 27 '18 at 14:21

0 Answers0