I've been working on my python skills. Here's a raw text file for data I'm working on: Titanic data
Each row stands for one person on board. The file has several columns, including whether the person survived or not (3rd column). I'm trying to count the number of people from each demographic on board (i.e. how many males and how many females) and the number of the survivors from each group.
I'm trying to do this in three stages: First, adding a column for the prefix relevant to the person (Mr, Mrs, Miss). Then, defined a function - get_avg() to identify the column the information would be found and the possible values of that column, feeding them to the grab_values function. Third, grab_values() counts the number of people from each group and the number of survivors.
That's all nice and dandy... but it doesn't work. I keep getting 0s for counts and sums. Tried sticking in a print command wherever possible and made some progress, but still can't understand what I should do. I have a feeling like the function isn't running on all rows (or any of them), but don't know if that's really the reason and what to do about it.
Could anyone please help?
import csv
titanic = open('shorttitanic.txt', "rb")
reader = csv.reader(titanic)
prefix_list = ["Mr ", "Mrs", "Mis"] # used to determine if passanger's name includes a prefix
# There are several demographic details we can count passengers and survivors with, this is a dictionary to map them out along with col number.
details = {"embarked":[5, "Southampton", "Cherbourg", "Queenstown", ""],
"sex":[10, "male","female"], "pclass":[1,"1st","2nd","3rd"],
"prefix":[12,"Mr ", "Mrs", "Mis"]} # first item is col number (starts at 0), other items are the possible values
# Adding another column for prefix:
rownum = 0
for row in reader:
# Finding the header:
if rownum == 0:
header = row
header.append("Prefix")
# print header
else:
prefix_location = row[3].find(",") + 2 # finds the position of the comma, the prefix starts after the comma and after a space (+2)
prefix = row[3][prefix_location:prefix_location+3] # grabs the 3 first characters of the prefix
# print len(prefix), prefix
if prefix in prefix_list: # if there's a prefix in the passanger's name, it's appended to the row
if prefix == "Mis":
row.append("Miss") # Mis is corrected to Miss on appending, since we must work with 3 chars
else:
row.append(prefix)
else:
row.append("Other/Unknown") # for cases where there's no prefix in the passanger's name
# print len(row), rownum, row[3], prefix, row[11]
# print row
rownum += 1
# grab_values() will run on all rows and count the number of passengers in each demographic and the number of survivors
def grab_values(col_num,i):
print col_num, "item name", i
count = 0
tot = 0
for row in reader:
# print type(row[col_num][0]
print row[col_num]
if row[col_num] == i:
count += 1
if row[2] == int(1):
tot += 1
# print count, tot
return count, tot
# get_avg() finds the column number and possible values of demographic x.
def get_avg(x): # x is the category (sex, embarked...)
col_num = details[x][0]
for i in details[x][1:]:
print col_num, i
# print type(i)
grab_values(col_num,i)
count,tot = grab_values(col_num,i)
print count,tot
# print i, count, tot
get_avg("sex")
titanic.close()
EDIT: changed the prefix values in the dict to: "prefix":[12,"Mr ", "Mrs", "Mis"]}, which had to be done.
EDIT2: Here's the finished code, in case anybody's interested. I took the advice of warunsl regarding the nature of the problem, but his solution didn't work, at least when I made the changes, so I can't pick it as the right solution in case others will ever find this thread and try to learn from it. Many thanks to the helpers!
import csv
titanic = open('titanic.txt', "rb")
reader = csv.reader(titanic)
prefix_list = ["Mr ", "Mrs", "Mis"] # used to determine if passanger's name includes a prefix. Using 3 chars because of Mr.
# There are several demographic details we can count passengers and survivors with, this is a dictionary to map them out along with col number.
details = {"embarked":[5, "Southampton", "Cherbourg", "Queenstown", ""],
"sex":[10, "male","female"], "pclass":[1,"1st","2nd","3rd"],
"prefix":[11,"Mr ", "Mrs", "Miss", "Unknown"]} # first item is col number (starts at 0), other items are the possible values
# try to see how the prefix values can be created by using 11 and a refernce to prefix_list
# Here we'll do 2 things:
# I - Add another column for prefix, and -
# II - Create processed_list with each of the rows in reader, since we can only run over reader once,
# and since I don't know much about handling CSVs or generator yet we'll run on the processed_list instead
processed_list = []
rownum = 0
for row in reader:
# Finding the header:
if rownum == 0:
header = row
header.append("Prefix")
else:
prefix_location = row[3].find(",") + 2 # finds the position of the comma, the prefix starts after the comma and after a space (+2)
prefix = row[3][prefix_location:prefix_location+3] # grabs the 3 first characters of the prefix
if prefix in prefix_list: # if there's a prefix in the passanger's name, it's appended to the row
if prefix == "Mis":
row.append("Miss") # Mis is corrected to Miss on appending, since we must work with 3 chars
else:
row.append(prefix)
else:
row.append("Unknown") # for cases where there's no prefix in the passanger's name
processed_list.append(row)
rownum += 1
# grab_values() will run on all rows and count the number of passengers in each demographic and the number of survivors
def grab_values(col_num,i):
# print col_num, "item name", i
num_on_board = 0
num_survived = 0
for row in processed_list:
if row[col_num] == i:
num_on_board += 1
if row[2] == "1":
num_survived += 1
return num_on_board, num_survived
# get_avg() finds the column number and possible values of demographic x.
def get_avg(x): # x is the category (sex, embarked...)
col_num = details[x][0]
for i in details[x][1:]:
print "Looking for: ", i, "at col num: ", col_num
grab_values(col_num,i)
num_on_board,num_survived = grab_values(col_num,i)
try:
proportion_survived = float(num_survived)/num_on_board
except ZeroDivisionError:
proportion_survived = "Cannot be calculated"
print "Number of %s passengers on board: " %i , num_on_board, "\n" \
"Number of %s passengers survived: " %i, num_survived, "\n" \
"Proportion of %s passengers survived: " %i, "%.2f%%" % (proportion_survived * 100), "\n"
print "Hello! I can calculate the proportion of passengers that survived according to these parameters: \n \
Embarked \n Sex \n Pclass \n Prefix", "\n"
def get_choice():
possible_choices = ["embarked","sex","pclass","prefix"]
choice = raw_input("Please enter your choice: ").lower()
if choice not in possible_choices:
print "Sorry, I can only work with Embarked / Sex / Pclass / Prefix. Please try again."
get_choice()
return choice
user_choice = get_choice()
get_avg(user_choice)
titanic.close()