0

For the following piece of code:

#################################
# Simple-Instagram-Scraper v1.0.3
# Release: 10.11.2020
# GitHub: do-me
#################################

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd, numpy as np
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import random 
import string
from time import gmtime, strftime
import os

# Chrome preferences applied to the browser session: the "images" content
# setting is set to 2 in both the default and the managed profile settings
# (presumably 2 means "block", i.e. do not load images -- confirm against
# Chrome's content-settings values).
chrome_prefs = {
    "profile.default_content_settings": {"images": 2},
    "profile.managed_default_content_settings": {"images": 2},
}

option = webdriver.ChromeOptions()
option.add_experimental_option("prefs", chrome_prefs)

# Parameters

#---------------------------------------------------------------

# CRUCIAL PARAMS

# Path to the chromedriver (or geckodriver) executable; find chromedriver here:
# https://chromedriver.chromium.org/
# "yourpath/chromedriver.exe" is only a placeholder: replace it with the real
# location of the driver binary on this machine (for example an absolute path
# such as "/home/<user>/Downloads/chromedriver_linux64/chromedriver" on Linux).
# If no file exists at the given path, Selenium raises WebDriverException
# ("... executable needs to be in PATH").
browser = webdriver.Chrome("yourpath/chromedriver.exe", options=option)

# user credentials
# NOTE(review): these are placeholders -- fill in locally and avoid committing
# real credentials with this file.
username = "username"
userpassword = "userpassword"

# which page?
pagetoscrape = "https://www.instagram.com/explore/locations/118546/thessaloniki/" # either hashtag, location id or user account possible
# pagetoscrape = "https://www.instagram.com/explore/tags/berlin/"

#---------------------------------------------------------------

# OPTIONAL PARAMS

# maximum posts to scrape
maxiter = 10000

# filename for output csv, built from the last two path segments of the page URL,
# i.e. '118546-thessaloniki.csv' for locations or 'tags-berlin.csv' for hashtags.
# Trailing slashes are stripped first so the name is the same whether or not the
# URL ends with "/" (slicing off the last character would otherwise truncate it).
out_csv = "-".join(pagetoscrape.rstrip("/").split("/")[-2:]) + ".csv"

# quite crucial but subject to trial and error due to unknown Instagram blocking policy: breaks
# set a random break duration for every iteration after opening one post and before going to the next one
short_pauseduration_min = 2 # seconds
short_pauseduration_max = 2.7 # seconds

# set a random break duration for longer breaks...
long_pauseduration_min = 4.8 # seconds
long_pauseduration_max = 10.5 # seconds

# ...for the following random iterations (number of iterations = index of scraped posts)
# 400 distinct random iteration indices between 10 and 9999 (call pauselist.sort() if a sorted list is wanted)
pauselist = random.sample(range(10, 10000), 400)
# just to make sure: also add one index per 50 iterations (20, 70, 120, ... plus a random offset of 1-9).
# extend() is required here: append() would insert the whole list as a single nested
# element, so none of these indices would actually be members of pauselist.
pauselist.extend(x + int(random.uniform(1, 10)) for x in range(20, 10000, 50))

#---------------------------------------------------------------

# ADVANCED PARAMS

# uncomment for saving in a folder called "data" you created before manually
# if os.getcwd().split("\\")[-1] != "data": # if you changed working dir already for some reason
#  os.chdir(r"data") 

I get the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
     75                                             stderr=self.log_file,
---> 76                                             stdin=PIPE)
     77         except TypeError:

~/anaconda3/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    799                                 errread, errwrite,
--> 800                                 restore_signals, start_new_session)
    801         except:

~/anaconda3/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
   1550                             err_msg += ': ' + repr(err_filename)
-> 1551                     raise child_exception_type(errno_num, err_msg, err_filename)
   1552                 raise child_exception_type(err_msg)

FileNotFoundError: [Errno 2] No such file or directory: 'yourpath/chromedriver.exe': 'yourpath/chromedriver.exe'

During handling of the above exception, another exception occurred:

WebDriverException                        Traceback (most recent call last)
<ipython-input-3-943542c00cca> in <module>
     34 
     35 # directory to chromedriver or geckodriver, find chromedriver here: https://chromedriver.chromium.org/
---> 36 browser = webdriver.Chrome("yourpath/chromedriver.exe", options=option)
     37 
     38 # user credentials

~/anaconda3/lib/python3.7/site-packages/selenium/webdriver/chrome/webdriver.py in __init__(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)
     71             service_args=service_args,
     72             log_path=service_log_path)
---> 73         self.service.start()
     74 
     75         try:

~/anaconda3/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
     81                 raise WebDriverException(
     82                     "'%s' executable needs to be in PATH. %s" % (
---> 83                         os.path.basename(self.path), self.start_error_message)
     84                 )
     85             elif err.errno == errno.EACCES:

WebDriverException: Message: 'chromedriver.exe' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home

Two questions here:

  1. how do I force Jupyter notebook to use Chrome instead of Firefox?
  2. how do I fix this error in Ubuntu 20.04?
Mona Jalal
  • 34,860
  • 64
  • 239
  • 408

1 Answer

2

For 1: your first question is already covered by the existing question "How to change the default browser used by the ipython/jupyter notebook in Linux?"

For 2:

It doesn't need to be on your path. If you notice, it's actually telling you that it didn't find a chromedriver at the path you provided because there was no file there. Give it a working path and you'll be fine.

M Z
  • 4,571
  • 2
  • 13
  • 27
  • thank you this is where the executable is ~/Downloads/chromedriver_linux64.zip/chromedriver/chromedriver_linux64/chromedriver not sure what to do with it. could you please expand your answer for 2. and help me figure what to do with this? – Mona Jalal Nov 30 '20 at 02:46
  • ~ won't work as a path string in this case. Either give an absolute (starting with /) or a relative path from your current directory – M Z Nov 30 '20 at 02:48
  • How should I do this? could you please be more specific? `Either give an absolute (starting with /) or a relative path from your current directory` – Mona Jalal Nov 30 '20 at 02:50
  • Here's an example. If you're on windows, an absolute path would be something like `"C:/Users/username/Downloads/..../chromedriver.exe"`. On Linux, it'd look something like `"/usr/Downloads/..../chromedriver"`. I'm not sure how much else I could help with, you can find – M Z Nov 30 '20 at 02:54
  • well it is in the path and still doesn't work! `$ echo $PATH /home/mona/anaconda3/bin:/home/mona/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:~/Downloads/chromedriver_linux64.zip/chromedriver/chromedriver_linux64` – Mona Jalal Nov 30 '20 at 02:58
  • you're doing several things here that I don't understand. 1) you have a link going into a zip folder. How? 2) It's still not a complete path. I meant path, not PATH. Notice how everything else in your PATH starts with `/` while your chromedriver entry starts with `~` – M Z Nov 30 '20 at 04:16
  • 1
    You're also passing it a path as an argument in your program, so adding it to your system PATH will get overwritten – M Z Nov 30 '20 at 04:17
  • browser = webdriver.Chrome("/home/mona/Downloads/chromedriver_linux64/chromedriver", options=option) fixed the error thanks – Mona Jalal Nov 30 '20 at 04:21