I need to write a function, get_words_from_file(filename), that returns a list of lower case words. Your function should only process lines between the start and end marker lines. The words should be in the same order as they occur in the file. Here's a sample text file: baboosh.txt:
*** START OF TEST CASE ***
......list of sentences here.....
*** END OF TEST CASE ***
This is after the end and should be ignored too.
Here's what I came up with:
import re
from string import punctuation
def stripped_lines(lines):
for line in lines:
stripped_line = line.rstrip('\n')
yield stripped_line
def lines_from_file(fname):
with open(fname, 'rt') as flines:
for line in stripped_lines(flines):
yield line
def is_marker_line(line, start='***', end='***'):
'''
Marker lines start and end with the given strings, which may not
overlap. (A line containing just '***' is not a valid marker line.)
'''
min_len = len(start) + len(end)
if len(line) < min_len:
return False
return line.startswith(start) and line.endswith(end)
def advance_past_next_marker(lines):
'''
'''
for line in lines:
if is_marker_line(line):
break
def lines_before_next_marker(lines):
valid_lines = []
for line in lines:
if is_marker_line(line):
break
line.replace('"', '')
valid_lines.append(line)
for content_line in valid_lines:
yield content_line
def lines_between_markers(lines):
'''
Yields the lines between the first two marker lines.
'''
it = iter(lines)
advance_past_next_marker(it)
for line in lines_before_next_marker(it):
yield line
def words(lines):
text = '\n'.join(lines).lower().split()
return text
def get_words_from_file(fname):
return words(lines_between_markers(lines_from_file(fname)))
#This is the test code that must be executed
filename = "baboosh.txt"
words = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(words)))
print("Valid word list:")
for word in words:
print(word)
My Output
I am getting the correct word list. But when printed, I am getting punctuation such as colons, semi-colons, and periods. I dont know how else to get rid of these.
How can I do this?