0

The following code runs fine. It gathers information per listing on LinkedIn.

(Account info given and free to use as it is a test account)

However, the output joins all the data together instead of giving each field its own column.

I want the output printed in Excel, with each field in the dictionary (Name, Company, Location) in its own column and each value in its own cell.

See attached for an example of expected output-

PictureLink

I have tried BeautifulSoup, but I don't think that works.

import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Accumulator for scraped results; exactly one dict is appended to it below.
test1=[]
# NOTE: `options` is created but never passed to webdriver.Chrome below.
options = Options()
# Start Chrome using the driver provided by webdriver_manager.
driver = webdriver.Chrome(ChromeDriverManager().install())

# Login URL; its session_redirect parameter encodes a people-search results
# URL (keywords=sales, page=2).
url = "https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fsearch%2Fresults%2Fpeople%2F%3FcurrentCompany%3D%255B%25221252860%2522%255D%26geoUrn%3D%255B%2522103644278%2522%255D%26keywords%3Dsales%26origin%3DFACETED_SEARCH%26page%3D2&fromSignIn=true&trk=cold_join_sign_in"
driver.get(url)
# Fixed pause after navigation before looking up the form fields.
time.sleep(2)

# Fill in the login form (fields located by element id) and submit it.
# NOTE(review): credentials are hard-coded in the source.
username = driver.find_element_by_id('username')
username.send_keys('kbradons04@gmail.com')
password = driver.find_element_by_id('password')

password.send_keys('Applesauce1')
password.submit()
# Scroll the window to the bottom of the document.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Fixed pause before querying the page for results.
time.sleep(3)

# Wait up to 10 seconds until all elements matching the selector are visible,
# then keep their text; these values are stored under "Location" below.
elementj=(WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".subline-level-2.t-12.t-black--light.t-normal.search-result__truncate"))))
place1=[j.text for j in elementj]


# Same wait-then-collect pattern; these values are stored under "Company" below.
elementk=WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".subline-level-1.t-14.t-black.t-normal.search-result__truncate")))
compan=[c.text for c in elementk]


# Text of every element with class "actor-name" (looked up without an explicit
# wait); stored under "Title" below.
element1 = driver.find_elements_by_class_name("actor-name")
title=[t.text for t in element1]


# Each key maps to a whole list, so `test1` ends up holding a single dict of
# three lists rather than one record per search result.
diction={"Location":place1,"Company":compan,"Title":title}
test1.append(diction)
print(test1)
Void S
  • 752
  • 4
  • 14

1 Answers1

1

I can run your code.

Here is what I get, with help from the post "Efficient way to unnest (explode) multiple list columns in a pandas DataFrame":

import time
import pandas as pd
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Accumulator for scraped results; exactly one dict is appended to it below.
test1=[]
# NOTE: `options` is created but never passed to webdriver.Chrome below.
options = Options()
# Start Chrome using the driver provided by webdriver_manager.
driver = webdriver.Chrome(ChromeDriverManager().install())

# Login URL; its session_redirect parameter encodes a people-search results
# URL (keywords=sales, page=2).
url = "https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fsearch%2Fresults%2Fpeople%2F%3FcurrentCompany%3D%255B%25221252860%2522%255D%26geoUrn%3D%255B%2522103644278%2522%255D%26keywords%3Dsales%26origin%3DFACETED_SEARCH%26page%3D2&fromSignIn=true&trk=cold_join_sign_in"
driver.get(url)
# Fixed pause after navigation before looking up the form fields.
time.sleep(2)

# Fill in the login form (fields located by element id) and submit it.
# NOTE(review): credentials are hard-coded in the source.
username = driver.find_element_by_id('username')
username.send_keys('kbradons04@gmail.com')
password = driver.find_element_by_id('password')

password.send_keys('Applesauce1')
password.submit()
# Scroll the window to the bottom of the document.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Fixed pause before querying the page for results.
time.sleep(3)

# Wait up to 10 seconds until all elements matching the selector are visible,
# then keep their text; these values are stored under "Location" below.
elementj=(WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".subline-level-2.t-12.t-black--light.t-normal.search-result__truncate"))))
place1=[j.text for j in elementj]


# Same wait-then-collect pattern; these values are stored under "Company" below.
elementk=WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".subline-level-1.t-14.t-black.t-normal.search-result__truncate")))
compan=[c.text for c in elementk]


# Text of every element with class "actor-name" (looked up without an explicit
# wait); stored under "Title" below.
element1 = driver.find_elements_by_class_name("actor-name")
title=[t.text for t in element1]


# Each key maps to a whole list, so `test1` ends up holding a single dict of
# three lists rather than one record per search result.
diction={"Location":place1,"Company":compan,"Title":title}
test1.append(diction)
print(test1)

# `test1` holds one dict, so this frame has a single row whose cells each
# contain an entire list.
df = pd.DataFrame(test1)

def explode(df, lst_cols, fill_value=''):
    """Unnest list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``lst_cols`` cells hold list-like values.
    lst_cols : str or list of str
        Column(s) to unnest. A single column name is wrapped into a list.
        Row repetition is driven by the list lengths of the FIRST column in
        ``lst_cols``, so within a row the other listed columns must hold
        lists of that same length.
    fill_value : scalar, default ''
        Replacement for missing values in the result. Only applied when at
        least one row has an empty list in the first list column.

    Returns
    -------
    pandas.DataFrame
        A frame with the same columns, in the same order, as ``df``. The
        non-list columns are repeated once per list element. Rows whose list
        is empty are appended at the end with ``fill_value`` in the list
        columns.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # lengths of the lists in the first list column; computed once and reused
    lens = df[lst_cols[0]].str.len()

    # Repeat the non-list values once per list element, then attach the
    # flattened list columns next to them.
    result = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # At least one list is empty: such rows contribute no elements above,
        # so add them back explicitly. `pd.concat` is used because
        # `DataFrame.append` was removed in pandas 2.0.
        empty_rows = df.loc[lens == 0, idx_cols]
        result = pd.concat([result, empty_rows], sort=False).fillna(fill_value)

    # restore the original column order
    return result.loc[:, df.columns]

# Unnest the three list-valued columns into one row per list element.
# NOTE: the returned frame is not assigned to a name here; assign it if the
# result needs to be kept or written out.
explode(df,['Location','Company','Title'])

And the result

    Location            Company                                 Title
0   Dayton, Ohio Area   National Account Executive              LinkedIn Member
1   Dayton, Ohio Area   Currently seeking permanent employment  LinkedIn Member
2   Dayton, Ohio Area   Account Manager at LexisNexis           LinkedIn Member
3   Greater Denver Area Currently seeking new opportunities in managem...   LinkedIn Member
4   Dayton, Ohio Area   Advertising Sales Representative at AMOS MEDIA  LinkedIn Member
5   Dayton, Ohio Area   Territory Manager at Huntington Outdoor, LLC    LinkedIn Member
6   Vandalia, Ohio, United States   Cintas  LinkedIn Member
7   Dayton, Ohio Area   Outside Sales Representative at Carter Lumber.  LinkedIn Member
8   Dayton, Ohio Area   Actively Searching  LinkedIn Member
9   Corpus Christi, Texas Area  Currently looking for sales position    LinkedIn Member
Paul Brennan
  • 2,638
  • 4
  • 19
  • 26