
The problem with this code: it crops element screenshots out of a full-page screenshot, and that works for text elements like Home, About, and Menu, but the crops for buttons, the form, and some other interactive elements come out completely black. I need the form and button images cropped correctly like the others, but I can't find the mistake. Why would only the buttons and forms produce black screenshots?
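
One thing that may explain the black crops is a mismatch between the CSS-pixel coordinates Selenium reports and the physical pixels in the screenshot PNG (for example on a high-DPI display, or when Chrome renders at a devicePixelRatio other than 1). A small diagnostic sketch to compare the two, reusing the driver, Image, and BytesIO from the code below:

# Diagnostic sketch: if png_width differs from css_width, crops computed
# from element.location land on the wrong part of the image.
dpr = driver.execute_script('return window.devicePixelRatio')
css_width = driver.execute_script('return document.documentElement.clientWidth')
png_im = Image.open(BytesIO(driver.get_screenshot_as_png()))
print(f'devicePixelRatio={dpr}, css_width={css_width}, png_width={png_im.size[0]}')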

import glob
import itertools
import json
import os
import re
import time
import uuid
from io import BytesIO
from multiprocessing.pool import ThreadPool
from shutil import copyfile
from urllib.request import Request, urlopen

import boto3
import numpy as np
import requests
from botocore.exceptions import ClientError
from bs4 import BeautifulSoup
from IPython.display import clear_output, display  # display is used in get_images(verbose=1)
from PIL import Image

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# import openai




def xpath_soup(element):
    # type: (typing.Union[bs4.element.Tag, bs4.element.NavigableString]) -> str
    """
    Generate xpath from BeautifulSoup4 element.
    :param element: BeautifulSoup4 element.
    :type element: bs4.element.Tag or bs4.element.NavigableString
    :return: xpath as string
    :rtype: str
    Usage
    -----
    >>> import bs4
    >>> html = (
    ...     '<html><head><title>title</title></head>'
    ...     '<body><p>p <i>1</i></p><p>p <i>2</i></p></body></html>'
    ...     )
    >>> soup = bs4.BeautifulSoup(html, 'html.parser')
    >>> xpath_soup(soup.html.body.p.i)
    '/html/body/p[1]/i'
    >>> import bs4
    >>> xml = '<doc><elm/><elm/></doc>'
    >>> soup = bs4.BeautifulSoup(xml, 'lxml-xml')
    >>> xpath_soup(soup.doc.elm.next_sibling)
    '/doc/elm[2]'
    """
    components = []
    child = element if element.name else element.parent
    for parent in child.parents:  # type: bs4.element.Tag
        siblings = parent.find_all(child.name, recursive=False)
        components.append(
            child.name if 1 == len(siblings) else '%s[%d]' % (
                child.name,
                next(i for i, s in enumerate(siblings, 1) if s is child)
            )
        )
        child = parent
    components.reverse()
    return '/%s' % '/'.join(components)


def save_screenshot(driver: webdriver.Chrome, path: str = './screenshot.png'):
    # Ref: https://stackoverflow.com/a/52572919/
    original_size = driver.get_window_size()
    # Resize the window to the full document so one screenshot covers the whole page.
    required_width = driver.execute_script('return document.body.parentNode.scrollWidth')
    required_height = driver.execute_script('return document.body.parentNode.scrollHeight')
    driver.set_window_size(required_width, required_height)
    png = driver.get_screenshot_as_png()  # full-page PNG kept in memory for cropping
    driver.save_screenshot(path)  # also saved to disk; includes the scrollbar
    # driver.find_element_by_tag_name('body').screenshot(path)  # avoids scrollbar
    # driver.set_window_size(original_size['width'], original_size['height'])
    return driver, png
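
If the PNG turns out to be larger than the CSS viewport, the crop box has to be scaled up before calling im.crop. A minimal sketch, assuming the scale factor is window.devicePixelRatio (crop_element is a hypothetical helper, not part of the original code):

def crop_element(im, location, size, scale=1.0):
    # Scale CSS-pixel coordinates from element.location / element.size
    # up to physical screenshot pixels before cropping.
    left = int(location['x'] * scale)
    top = int(location['y'] * scale)
    right = int((location['x'] + size['width']) * scale)
    bottom = int((location['y'] + size['height']) * scale)
    return im.crop((left, top, right, bottom))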


def get_images(driver, elements, im, verbose=0):
    new_elements = []
    for idx in range(len(elements)):
        xpath = elements[idx]['xpath']
        text = elements[idx]['originalText']

        try:
            element = driver.find_element_by_xpath(xpath)
            if element.is_displayed():
                # location/size are reported in CSS pixels, relative to the document
                location = element.location
                size = element.size

                cropped_im = im.crop((
                    location['x'], location['y'],
                    location['x'] + size['width'], location['y'] + size['height']
                ))

                if verbose:
                    print('xpath: ', xpath)
                    print('text: ', text)
                # elements[idx]['image'] = cropped_im
                _, image_url = pil_to_s3(cropped_im)
                elements[idx]['jpeg_path'] = image_url
                if verbose:
                    display(cropped_im)
                    print('\n\n')

                new_elements.append(elements[idx])
            else:
                if verbose: print(f"element {xpath} is not displayed")
        except Exception as e:
            # The bare `except: pass` was hiding every failure here (bad xpath,
            # stale element, missing S3 client); at least surface the error.
            if verbose: print(f"failed on {xpath}: {e}")
    return new_elements
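
An alternative worth trying (an assumption on my side, not verified against this page): let the driver screenshot each element directly instead of cropping the full-page image, which sidesteps coordinate mismatches entirely. screenshot_as_png on a WebElement needs a reasonably recent Selenium and chromedriver:

def get_element_image(driver, xpath):
    # Hypothetical variant of the crop logic above: ask the driver for a
    # screenshot of the element itself instead of cropping the page PNG.
    element = driver.find_element_by_xpath(xpath)
    if not element.is_displayed():
        return None
    return Image.open(BytesIO(element.screenshot_as_png))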


# The S3 client was never defined in the original snippet; without it,
# pil_to_s3 raises NameError (which the old bare except silently swallowed).
client = boto3.client('s3')


def pil_to_s3(pil_image):
    bucket_name = 'ekkelai-kevins'
    key = str(uuid.uuid4()) + '.jpg'

    pil_image = pil_image.convert('RGB')  # JPEG cannot store an alpha channel
    buffer = BytesIO()
    pil_image.save(buffer, "JPEG")
    buffer.seek(0)  # rewind pointer back to start
    response = client.put_object(
        ACL='public-read',
        Bucket=bucket_name,
        Key=key,
        Body=buffer,
        ContentType='image/jpeg',
    )
    image_url = "https://{0}.s3.amazonaws.com/{1}".format(bucket_name, key)
    return response, image_url
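
For completeness, a minimal smoke test of the upload path (the bucket name comes from the code above; AWS credentials are assumed to be configured through the usual boto3 mechanisms):

# Hypothetical check: upload a solid-colour tile and print the public URL.
test_im = Image.new('RGB', (32, 32), color=(200, 50, 50))
_, test_url = pil_to_s3(test_im)
print(test_url)  # e.g. https://ekkelai-kevins.s3.amazonaws.com/<uuid>.jpg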


# text = "Making AI Simple For You."

def lambda_handler(event_jsonified, context):
    outputs = {}
    try:
        event = json.loads(event_jsonified['body'])
        url = event['url']
        tag_types = ['h1', 'h2']
        req = Request(url)  # headers={'User-Agent': 'Mozilla/5.0'}
        html = urlopen(req).read()
        soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly instead of the ambiguous 'html'

        outputs['gpt3_text'] = get_options([tag_types], soup)
        outputs['conversion_elements'] = [
            {'xpath': xpath_soup(element), 'tagType': element.name, 'originalText': element.text} for element in
            soup.find_all(['input'])]

        return {
            'statusCode': 200,
            'headers': {
                'Access-Control-Allow-Headers': 'Content-Type',
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods': '*'
            },
            'body': json.dumps(outputs),
        }

    except Exception as e:
        outputs['error'] = str(e)
        # outputs['error'] = traceback.format_exc()
        return {
            'statusCode': 200,
            'headers': {
                'Access-Control-Allow-Headers': 'Content-Type',
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods': '*'
            },
            'body': json.dumps(outputs)
        }


#########################################################################
def create_phrases(text):
    # NOTE: `import openai` is commented out at the top and the key below is
    # empty, so this function is dead code in this snippet; get_phrases
    # returns placeholder options instead.
    openai.api_key = ""
    response = openai.Completion.create(
        engine="davinci-instruct-beta",
        prompt=text,
        temperature=0.65,
        max_tokens=60,
        top_p=1,
        frequency_penalty=0.2,
        presence_penalty=0.0,
        stop=["\n\n"]
    )
    return response['choices'][0]['text']


# ========================================================================
def get_phrases(text):
    # outputs = []
    # while len(outputs) < 3:
    #     inp_text = f'Rephrase the following in three ways:\n""""""\n{text}\n""""""\n1.'
    #     out = create_phrases(inp_text)
    #     out = out.strip().replace('"', '').replace('\n\n', '\n')
    #     outputs = out.split('\n')
    # outputs = outputs[0:3]

    # '''i am removing digits from selected three outputs'''
    # for i in range(len(outputs)):
    #     digits = re.findall(r'[0-9]+\.', outputs[i])
    #     if len(digits) > 0:
    #         for j in range(len(digits)):
    #             outputs[i] = outputs[i].replace(digits[j], '')
    outputs = ['Option 1', 'Option 2', 'Option 3']
    return outputs


#########################################################################

def get_options(tag_types, soup):
    data = ''
    for tag_type in tag_types:
        all_elements = soup.find_all(tag_type)
        print(f"total elements: {len(all_elements)}")
        print(f"elements for tag type: {tag_type}: {len(all_elements)}")
        if len(all_elements)<1:
            continue
        pool = ThreadPool(len(all_elements))

        all_txt_elements, all_xpaths, all_tag_types = [], [], []
        for element in all_elements:
            text = element.getText()
            tag_type = element.name

            if text != None and text != '':
                xpath = xpath_soup(element)
            else:
                continue

            all_txt_elements.append(text)
            all_xpaths.append(xpath)
            all_tag_types.append(tag_type)

        # print('lengths: ', len(all_txt_elements), len(all_xpaths), len(all_tag_types))
        # NOTE: data is reassigned on every loop iteration, so only the last
        # tag type's results survive; the callers avoid this by passing one
        # combined list, e.g. get_options([['h1', 'h2']], soup).
        data = pool.starmap(process_element, zip(all_txt_elements, all_xpaths, all_tag_types))
        pool.close()
    return data



def process_element(text, xpath, tag_type):
    options = get_phrases(text)

    return {
        'xpath': xpath,
        'tagType': tag_type,
        'originalText': text,
        'options': options
    }


tag_types = ['h1', 'h2']
outputs = {}
# json_data = request.get_json()
# url = json_data['url']
url = "http://stevens.ekkel.ai"
print("*"*100)
print(url)
print("*"*100)

# url = "https://www.tourism.net.nz/attractions-and-activities#tab-categories"
chrome_options = Options()
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--disable-gpu')
#chrome_options.add_argument('--window-size=1280x1696')
#chrome_options.add_argument('--user-data-dir=/tmp/user-data')
#chrome_options.add_argument('--hide-scrollbars')
#chrome_options.add_argument('--enable-logging')
#chrome_options.add_argument('--log-level=0')
#chrome_options.add_argument('--v=99')
#chrome_options.add_argument('--single-process')
#chrome_options.add_argument('--data-path=/tmp/data-path')
#chrome_options.add_argument('--ignore-certificate-errors')
#chrome_options.add_argument('--homedir=/tmp')
#chrome_options.add_argument('--disk-cache-dir=/tmp/cache-dir')
#chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36')

# chrome_options.add_argument('--start-maximized')
driver = webdriver.Chrome(executable_path=r'./chromedriver.exe', options=chrome_options)  # `options=` is the current keyword; `chrome_options=` is deprecated
driver.get(url)
time.sleep(2)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly instead of the ambiguous 'html'
# print(soup)
# raise Exception
# get eligible elements
# find_all accepts a list, so wrapping tag_types makes get_options handle h1 and h2 in one pass
outputs['gpt3_text'] = get_options([tag_types], soup)
outputs['conversion_elements'] = [
    {'xpath': xpath_soup(element), 'tagType': element.name, 'originalText': element.text} for element in
    soup.find_all(['a', 'button', 'form'])]

# get full screenshot
driver, png = save_screenshot(driver, path='./screenshot.png')
im = Image.open(BytesIO(png))  # uses PIL library to open image in memory

# get and save element screenshots
outputs['conversion_elements'] = get_images(driver, outputs['conversion_elements'], im)
outputs['gpt3_text'] = get_images(driver, outputs['gpt3_text'], im)

# quit driver and cleanup
driver.quit()
print("done")
