When I run DBSCAN on 200,000 rows x 4 columns everything works fine and I get good results, but as soon as I move up to 500,000 x 4 it fails with a MemoryError. The full table is 9,800,000 x 4, so I'm not even close yet.
I've looked online for solutions but haven't been able to find any.
I'm not a coder by any means, so the code below is simple (and not written by me), but I'm not sure where else to go with this: how can I cluster a dataset of this size without running out of memory? Here is the code I'm running:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Load the data and replace missing values with a numeric 0 (not the string '0')
data = pd.read_csv(r'C:\Users\David\Documents\Kutscriptie\hardcoreg2.csv')
data.fillna(0, inplace=True)

# Only take the first 500,000 of the 9,800,000 rows for now
data = data.head(500000)

# Standardise the four columns, then cluster
data_cluster = StandardScaler().fit_transform(data)
db = DBSCAN(eps=0.5, min_samples=6).fit(data_cluster)
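
In case it helps anyone reproduce this without my CSV, the same call can be tried on synthetic data of the same shape using make_blobs (already imported above). This is just a sketch: the number of centers is made up, and whether it actually hits the MemoryError will depend on how much RAM the machine has.

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for my CSV: 500,000 rows x 4 columns
X, _ = make_blobs(n_samples=500000, n_features=4, centers=8, random_state=0)
X = StandardScaler().fit_transform(X)

# Same DBSCAN call as in my script; fit() is the step that runs out of memory on my data
db = DBSCAN(eps=0.5, min_samples=6).fit(X)

The full traceback below comes from running the original code above on my CSV: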
MemoryError Traceback (most recent call last)
<ipython-input-6-b09830897f6a> in <module>
----> 1 db = DBSCAN(eps=0.5, min_samples=6).fit(data_cluster)
~\anaconda3\lib\site-packages\sklearn\cluster\_dbscan.py in fit(self, X, y, sample_weight)
333 # This has worst case O(n^2) memory complexity
334 neighborhoods = neighbors_model.radius_neighbors(X,
--> 335 return_distance=False)
336
337 if sample_weight is None:
~\anaconda3\lib\site-packages\sklearn\neighbors\_base.py in radius_neighbors(self, X, radius, return_distance, sort_results)
973 sort_results=sort_results)
974
--> 975 for s in gen_even_slices(X.shape[0], n_jobs)
976 )
977 if return_distance:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\sklearn\neighbors\_base.py in _tree_query_radius_parallel_helper(tree, *args, **kwargs)
786 cloudpickle under PyPy.
787 """
--> 788 return tree.query_radius(*args, **kwargs)
789
790
sklearn\neighbors\_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.query_radius()
sklearn\neighbors\_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.query_radius()
MemoryError:
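
If I read the "# This has worst case O(n^2) memory complexity" comment in the traceback correctly, radius_neighbors has to hold the indices of every neighbor within eps for all 500,000 points before DBSCAN assigns any labels. A rough back-of-envelope estimate (the average neighborhood size of 10,000 is just a guess on my part, not something I measured):

n = 500000               # rows being clustered
avg_neighbors = 10000    # hypothetical average number of points within eps
bytes_per_index = 8      # one int64 index stored per neighbor
total_gb = n * avg_neighbors * bytes_per_index / 1e9
print(f"~{total_gb:.0f} GB just for the neighbor index arrays")  # ~40 GB

If that estimate is anywhere near right, it would at least be consistent with 200,000 rows fitting in memory while 500,000 does not.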