To scrape Google (or any other site) with JavaScript, you can use Node.js. There are three popular ways to do this:
- making an HTTP request and parsing the HTML;
- using browser automation;
- using a ready-made API.
First solution (axios + cheerio). It's fast and simple, but it can't get dynamic content (content built with JavaScript) from the page, and it can be blocked by the site's protection (read more in the Reducing the chance of being blocked while web scraping blog post):
const cheerio = require("cheerio");
const axios = require("axios");

const searchString = "some search query";

const AXIOS_OPTIONS = {
  headers: {
    // A real browser User-Agent lowers the chance of getting a CAPTCHA page
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  },
  params: { q: searchString, hl: "en", gl: "us" }, // query, interface language, country
};

async function getLinks() {
  const { data } = await axios.get("http://www.google.com/search", AXIOS_OPTIONS);
  const $ = cheerio.load(data);
  // ".yuRUbf > a" matches the link element of each organic result
  return Array.from($(".yuRUbf > a")).map((el) => $(el).attr("href"));
}
getLinks();
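If you need more than just the links, the same cheerio approach extends naturally. The sketch below (not part of the original example) also grabs each result's title; the .yuRUbf container and the nested a/h3 elements matched Google's markup at the time of writing, but these selectors change regularly, so treat them as placeholders you will have to verify:

async function getResults() {
  const { data } = await axios.get("http://www.google.com/search", AXIOS_OPTIONS);
  const $ = cheerio.load(data);
  // Each ".yuRUbf" block holds one organic result: an <a> with the URL and an <h3> with the title
  return Array.from($(".yuRUbf")).map((el) => ({
    title: $(el).find("h3").text(),
    link: $(el).find("a").attr("href"),
  }));
}

getResults().then((results) => console.log(results));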
Second solution (puppeteer). It gives you more freedom and can do on the page whatever a human can do, but it's slower and harder to use:
const puppeteer = require("puppeteer");

const searchQuery = "some search query";

const searchParams = {
  query: encodeURIComponent(searchQuery), // escape the query for use in a URL
  hl: "en", // interface language
  gl: "us", // country of the search
};

const URL = `http://www.google.com/search?q=${searchParams.query}&hl=${searchParams.hl}&gl=${searchParams.gl}`;

async function getLinks() {
  const browser = await puppeteer.launch({
    headless: true, // set this to false if you want to watch what the browser is doing
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();
  await page.goto(URL);

  // ".yuRUbf > a" matches the link element of each organic result
  const links = await page.evaluate(() =>
    Array.from(document.querySelectorAll(".yuRUbf > a")).map((el) => el.getAttribute("href"))
  );

  await browser.close();
  return links;
}
getLinks();
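One caveat with the example above: if page.goto or page.evaluate throws, the browser never gets closed and a Chromium process is left running. A common pattern (a sketch, not part of the original code) is to wait for the result selector and wrap the work in try/finally:

async function getLinksSafely() {
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    await page.goto(URL);
    await page.waitForSelector(".yuRUbf > a"); // make sure the results have rendered
    return await page.evaluate(() =>
      Array.from(document.querySelectorAll(".yuRUbf > a")).map((el) => el.getAttribute("href"))
    );
  } finally {
    await browser.close(); // always close the browser, even when something throws
  }
}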
Third solution (SerpApi). The main advantage is that you don't need to pick CSS selectors from the page or maintain your scraper when those selectors change over time. It's also as fast as a plain HTTP request, but not every website is supported:
import { getJson } from "serpapi";

const getLinks = async () => {
  const response = await getJson("google", {
    api_key: API_KEY, // Get your API_KEY from https://serpapi.com/manage-api-key
    q: "some search query",
  });
  const links = response.organic_results.map((el) => el.link);
  return links;
};
getLinks();
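Because the response is structured JSON, extracting extra fields is just reading properties instead of hunting for selectors. A short sketch, assuming the usual title, link, and snippet fields of organic_results:

const getResults = async () => {
  const response = await getJson("google", {
    api_key: API_KEY, // Get your API_KEY from https://serpapi.com/manage-api-key
    q: "some search query",
  });
  // organic_results is an array of plain objects, so no HTML parsing is needed
  return response.organic_results.map(({ title, link, snippet }) => ({ title, link, snippet }));
};

getResults().then((results) => console.log(results));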