I am new to python and machine learning. I successfully tested DBN.py examples from deeplearing. now I want to put my own set of images into mnist.pkl.gz format
I already tried some code from a project named JPG-PNG-to-MNIST-NN-Format on github but it gives me idx format I used some code to convert this idx format to mnist.pkl but I found that there should be a validation_set images which is not presented in JPG-PNG-to-MNIST-NN-Format and my DBN.py code gives me error "ran out of input" I even tried this How to put my dataset in a .pkl file in the exact format and data structure used in "mnist.pkl.gz"? but I dont know how to prepare *.csv labels. this is my code
from PIL import Image
from numpy import genfromtxt
import gzip, cPickle
from glob import glob
import numpy as np
import pandas as pd
def dir_to_dataset(glob_files, loc_train_labels=""):
print("Gonna process:\n\t %s"%glob_files)
dataset = []
for file_count, file_name in enumerate( sorted(glob(glob_files),key=len) ):
image = Image.open(file_name)
img = Image.open(file_name).convert('LA') #tograyscale
pixels = [f[0] for f in list(img.getdata())]
dataset.append(pixels)
if file_count % 1000 == 0:
print("\t %s files processed"%file_count)
# outfile = glob_files+"out"
# np.save(outfile, dataset)
if len(loc_train_labels) > 0:
df = pd.read_csv(loc_train_labels)
return np.array(dataset), np.array(df["class"])
else:
return np.array(dataset)
Data1, y1 = dir_to_dataset("train\\*.png","train.csv")
Data2, y2 = dir_to_dataset("valid\\*.png","valid.csv")
Data3, y3 = dir_to_dataset("test\\*.png","test.csv")
# Data and labels are read
train_set_x = Data1[:7717]
train_set_y = y1[:7717]
val_set_x = Data2[:1653]
val_set_y = y2[:1653]
test_set_x = Data3[:1654]
test_set_y = y3[:1654]
# Divided dataset into 3 parts. I had 6281 images.
train_set = train_set_x, train_set_y
val_set = val_set_x, val_set_y
test_set = test_set_x, val_set_y
dataset = [train_set, val_set, test_set]
f = gzip.open('mnist.pkl.gz','wb')
cPickle.dump(dataset, f, protocol=2)
f.close()
but I get these errors
Gonna process:
train\*.png
Traceback (most recent call last):
File "to-mnist.py", line 27, in <module>
Data1, y1 = dir_to_dataset("train\\*.png","train.csv")
File "to-mnist.py", line 22, in dir_to_dataset
return np.array(dataset), np.array(df["class"])
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'class'
I think this has something to do with my *.csv files. the *.csv files are normal txt document with class of 0 and 1 in it. something like this
0
0
0
0
0
0
1
1
1
1