This might endup in very silly question, but being a newbie in python i am not able to find a good solution to following problem.
class Preprocessor:
mPath = None;
df = None;
def __init__(self, path):
self.mPath = path;
def read(self):
self.df = pd.read_csv(self.mPath);
return self.df;
def __findUniqueGenres(self):
setOfGenres = set();
for index, genre in self.df['genres'].iteritems():
listOfGenreInMovie = genre.lower().split("|");
for i, _genre in np.ndenumerate(listOfGenreInMovie):
setOfGenres.add(_genre)
return setOfGenres;
def __prepareDataframe(self, genres):
all_columns = set(["title", "movieId"]).union(genres)
_df = pd.DataFrame(columns=all_columns)
return _df;
def __getRowTemplate(self, listOfColumns):
_rowTemplate = {}
for col in listOfColumns:
_rowTemplate[col] = 0
return _rowTemplate;
def __createRow(self, rowTemplate, row):
rowTemplate['title'] = row.title;
rowTemplate['movieId'] = row.movieId;
movieGenres = row.genres.lower().split("|");
for movieGenre in movieGenres:
rowTemplate[movieGenre] = 1;
return rowTemplate;
def tranformDataFrame(self):
genres = self.__findUniqueGenres();
print('### List of genres...', genres);
__df = self.__prepareDataframe(genres); # Data frame with all required columns.
rowTemplate = self.__getRowTemplate(__df.columns)
print('### Row template looks like -->', rowTemplate)
collection = []
for index, row in self.df.iterrows():
_rowToAdd=self.__createRow(rowTemplate, row);
print('### Row looks like', _rowToAdd)
collection.append(_rowToAdd)
print('### Collection looks like', collection)
return __df.append(collection)
Here when i am trying to append a _rowToAdd
to collection, it endsup having a collection of last item ( last row of self.df
).
Below are logs for the same (self.df
has 3 rows here),
### List of genres... {'mystery', 'horror', 'comedy', 'drama', 'thriller', 'children', 'adventure'}
### Row template looks like --> {'title': 0, 'horror': 0, 'comedy': 0, 'drama': 0, 'children': 0, 'mystery': 0, 'movieId': 0, 'thriller': 0, 'adventure': 0}
### Row looks like {'title': 'Big Night (1996)', 'horror': 0, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 0, 'movieId': 994, 'thriller': 0, 'adventure': 0}
### Row looks like {'title': 'Grudge, The (2004)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 1, 'movieId': 8947, 'thriller': 1, 'adventure': 0}
### Row looks like {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}
### Collection looks like [{'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}, {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}, {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}]
I want my collection to like
### [
{'title': 'Big Night (1996)', 'horror': 0, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 0, 'movieId': 994, 'thriller': 0, 'adventure': 0},
{'title': 'Grudge, The (2004)', 'horror': 1, 'comedy': 0, 'drama': 0, 'children': 0, 'mystery': 1, 'movieId': 8947, 'thriller': 1, 'adventure': 0},
{'title': 'Cheetah (1989)', 'horror': 0, 'comedy': 0, 'drama': 0, 'children': 1, 'mystery': 0, 'movieId': 2039, 'thriller': 0, 'adventure': 1}
]
Dataset - https://grouplens.org/datasets/movielens/