I'm trying to update a dataframe (self.df) with a column from a temp df(self.df_temp['linkedin_profile']) with the following class but it doesn't seem to update anything. The code:
class NameToSocialURLScraper:
def __init__(self, csv_file_name, person_name_column, organization_name_column):
self.proxy_list = PROXY_LIST
pool = Pool()
self.csv_file_name = csv_file_name
self.person_name_column = person_name_column
self.organization_name_column = organization_name_column
self.df = pd.read_csv(csv_file_name)
self.df_temp = pd.DataFrame()
def internal_linkedin_job(self):
self.df['linkedin_profile'] = np.nan
self.df_temp['linkedin_profile'] = np.nan
self.df_temp['linkedin_profile'] = self.df.apply(
lambda row: term_scraper(
str(row[self.person_name_column]) + " " + str(row[self.organization_name_column]), self.proxy_list,
'link', output_generic=False), axis=1)
self.df['linkedin_profile'] = self.df_temp['linkedin_profile']
print(self.df.values)
...
def multiprocess_job(self):
multiprocessing.log_to_stderr(logging.DEBUG)
linkedin_profile_proc = Process(target=self.internal_linkedin_job, args=())
jobs = [linkedin_profile_proc]
# Start the processes (i.e. calculate the random number lists)
for j in jobs:
j.start()
# Ensure all of the processes have finished
for j in jobs:
j.join()
When printing inside internal_linkedin_job it shows the df with the new column 'linkedin_profile' but when I print after j.join() the column isn't there.