I'm trying to run the python multiprocessing library to speed up encoding of csv file. However I run into this error:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
I did create
Search = SemanticSearch(model_path, data_path, query)
if __name__ == '__main__':
query, flat, top_results = Search.search()
That points to the function in my class,
def setup(self):
with open(self.data_path, newline='') as f: # read and sort data
reader = csv.reader(f)
data1 = list(reader)
self.corpus = [x for sublist in data1 for x in sublist] # turn into 1D list
#SemanticSearch.encode(self)
self.texts_encodings = self.map(self.encode, self.corpus)
end = time.time()
print(end - self.start)
def encode(self):
self.start = time.time()
return self.model.encode(self.corpus, convert_to_tensor=True, show_progress_bar=True)
In my init function I did call and set
self.map = Pool().map
Any tips something I'm missing? Thanks in advance
EDIT
class SemanticSearch(object):
def __init__(self, model, data, query):
self.query = query
self.model = SentenceTransformer(model) ### Model location
self.data_path = data ###path to csv
self.corpus = None
self.texts_encodings = None
self.start = None
self.map = Pool().map
def setup(self):
print('here')
with open(self.data_path, newline='') as f: # read and sort data
reader = csv.reader(f)
data1 = list(reader)
self.corpus = [x for sublist in data1 for x in sublist] # turn into 1D list
# SemanticSearch.encode(self)
self.texts_encodings = self.map(self.encode, self.corpus)
# SemanticSearch.encode(self)
end = time.time()
print(end - self.start)
def encode(self):
self.start = time.time()
return self.model.encode(self.corpus, convert_to_tensor=True,
show_progress_bar=True) ##encode to invisible layer
def search(self):
SemanticSearch.setup(self)
if __name__ == "__main__":
model_path = r'data\BERT_MODELS\fine-tuned\multi-qa-MiniLM-L6-cos-v1'
data_path = 'data/raw_data/Jira-2_14_2022.csv'
query = 'query'
Search = SemanticSearch(model_path, data_path, query)
query, flat, top_results = Search.search()