1

I am trying to download images using this code. However, the website comes up with a captcha. When I try to select the captcha, it displays a broken PC image, and I cannot figure out a way past this. Is there a way to avoid the captcha altogether, or to select it somehow so that I can click its options via Selenium? The code is long, but it is an MRE.

from selenium import webdriver
from bs4 import BeautifulSoup as soup
from datetime import date ,timedelta
import requests
import time
import base64
import cv2
import pytesseract
import xlsxwriter
import numpy as np
import pandas as pd
import os
import shutil

# Module-level Chrome WebDriver instance; main() drives this browser session.
# NOTE(review): the positional "chromedriver" argument is presumably the driver
# executable name/path (Selenium 3 style) — confirm against the installed
# Selenium version.
driver = webdriver.Chrome("chromedriver")
# Tell pytesseract where the Tesseract executable lives (Windows install path).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Entries of the registration-type list, in the order main() selects them by
# index (option[index + 1] of the <select>); index 0 is skipped by main().
excel_name = [
    'Сите',
    'Упис на основање',
    '2б Ново со спојување на __',
    '3б Ново со раздвојување од __',
    '5б Ново со издвојување  од__',
    'Упис на промена',
    'Документ за корекција',
    '1б  Присоединување во __',
    '4б Превземање со раздвојување на __',
    '5а Издвојување во ново__',
    '6б Превземање од__',
    '6а Издвојување во __',
    'Документ за регистрирање на работно време',
    'Документ за определување на главна приходна шифра и големина',
    'Документ за евидентирање на казна/санкција',
    'Документ за евидентирање на бришење на казна/санкција',
    'Документ за евидентирање на стечај на друг субјект',
    'Документ за евидентирање на заклучување на стечај на друг субјект',
    'Упис на бришење',
    '2а Спојување во ново__со бришење',
    '4а Раздвојување со превземање во __ со бришење',
    '3а Раздвојување на ново __ со бришење',
    '1а Присоединување на __ со бришење',
    'Судска Процедура - Стечај',
    'Ликвидација',
    'Претстечај (Претходна постапка)',
    'Објава(Друго)',
    'Објава(Стечајна постапка)',
    'Објава(Ликвидациона постапка)',
    'Вонсудска спогодба',
    'Објава(Вонсудска спогодба)',
    'Предбелешка',
]
# Shortened list kept for debugging runs:
#excel_name = ['Сите','Упис на  основање','2б Ново со спојување на __','Упис на промена']

# File-name prefix per option, parallel to excel_name:
# 'image', then 'image0' .. 'image30'.
image_name = ['image'] + ['image' + str(suffix) for suffix in range(31)]


def _external_contours(binary):
    """Return the external contours of *binary* for either findContours signature.

    ``cv2.findContours`` yields a 2-tuple in some OpenCV releases and a
    3-tuple in others; the contour list is element 0 of the former and
    element 1 of the latter.
    """
    found = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return found[0] if len(found) == 2 else found[1]


def _prepare_for_ocr(image):
    """Strip table lines and small specks from a BGR image and return it on white."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Remove horizontal lines: open with a wide, flat kernel and paint the hits black.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
    detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    for contour in _external_contours(detect_horizontal):
        cv2.drawContours(thresh, [contour], -1, (0, 0, 0), 2)

    # Remove vertical lines: same idea with a tall, thin kernel.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))
    detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    for contour in _external_contours(detect_vertical):
        cv2.drawContours(thresh, [contour], -1, (0, 0, 0), 3)

    # Dilate to connect text, then erase connected regions smaller than 500 px.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=2)
    for contour in _external_contours(dilate):
        if cv2.contourArea(contour) < 500:
            cv2.drawContours(dilate, [contour], -1, (0, 0, 0), -1)

    # Keep only the pixels under the mask; everything else becomes white.
    result = cv2.bitwise_and(image, image, mask=dilate)
    result[dilate == 0] = (255, 255, 255)
    return result


def get_text(data_number, image_name, excel_name):
    """OCR a numbered series of downloaded images into a one-column workbook.

    Reads ``./images/<date 3 days ago>-<image_name><i>.png`` for ``i`` in
    ``range(data_number)``, OCRs each readable image (``mkd+eng``, ``--psm 6``)
    and writes the text to consecutive rows of column 0 in sheet "content" of
    ``<date 1 day ago>-<excel_name>.xlsx``.

    Args:
        data_number: Number of image indices to try.
        image_name: File-name prefix of the image series.
        excel_name: Suffix used in the workbook file name.

    Images that cannot be read are skipped without leaving an empty row.
    """
    # NOTE(review): the workbook is dated one day back while the images are
    # dated three days back — kept as in the original; confirm this is intended.
    workbook_path = str(date.today() - timedelta(days=1)) + '-' + excel_name + '.xlsx'
    image_prefix = './images/' + str(date.today() - timedelta(days=3)) + '-' + image_name
    print(image_name, data_number)

    # The context manager closes (and thereby writes) the workbook even if an
    # OCR step raises part-way through.
    with xlsxwriter.Workbook(workbook_path) as workbook:
        worksheet = workbook.add_worksheet("content")
        row = 0
        for i in range(data_number):
            image_path = image_prefix + str(i) + '.png'
            print(image_path)
            image = cv2.imread(image_path)
            if image is None:
                # imread signals a missing/unreadable file by returning None.
                continue

            result = _prepare_for_ocr(image)
            data = pytesseract.image_to_string(result, lang='mkd+eng', config='--psm 6')
            worksheet.write(row, 0, data)
            row += 1

def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of contours as produced by ``cv2.findContours``.
        method: One of "left-to-right", "right-to-left", "top-to-bottom" or
            "bottom-to-top". Any other value behaves like "left-to-right".

    Returns:
        A ``(contours, bounding_boxes)`` pair of tuples in sorted order.
        Both tuples are empty when *cnts* is empty.
    """
    # zip(*[]) yields nothing to unpack, so an empty input needs its own path.
    if len(cnts) == 0:
        return ((), ())

    # Reverse order for the two "backwards" directions.
    reverse = method in ("right-to-left", "bottom-to-top")
    # Bounding boxes are (x, y, w, h): index 1 sorts on y, index 0 on x.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(zip(cnts, bounding_boxes),
                     key=lambda pair: pair[1][axis], reverse=reverse)
    sorted_cnts, sorted_boxes = zip(*ordered)
    return (sorted_cnts, sorted_boxes)


def get_table(path):
    """Detect table cells in the image at *path* and group them into rows.

    Vertical and horizontal lines are extracted morphologically, combined into
    a grid image, and the contours of that grid are filtered by size to obtain
    cell boxes.

    Args:
        path: Path of the image file to analyse.

    Returns:
        ``(rows, image_colour)`` where ``rows`` is the result of
        ``arrange_rows`` (lists of ``[x, y, w, h]`` boxes) and ``image_colour``
        is the colour image loaded from *path*.

    Raises:
        ValueError: If OpenCV cannot read *path* as an image.

    Side effects:
        Writes the debug images ``vertical.jpg``, ``horizontal.jpg`` and
        ``img_vh.jpg`` to the current working directory.
    """
    image = cv2.imread(path, 0)
    image_colour = cv2.imread(path)
    if image is None or image_colour is None:
        raise ValueError("could not read image: " + str(path))

    ret, img = cv2.threshold(image, 240, 255, cv2.THRESH_BINARY)
    img_bin = 255 - img
    # Kernel length is 1% of the image width; never let it collapse to zero.
    kernel_len = max(1, img.shape[1] // 100)

    # Vertical lines: erode then dilate with a 1 x kernel_len kernel.
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)
    cv2.imwrite("vertical.jpg", vertical_lines)

    # Horizontal lines: same with a kernel_len x 1 kernel.
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)
    cv2.imwrite("horizontal.jpg", horizontal_lines)

    # Combine both line images, then erode and threshold the inverted result.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite("img_vh.jpg", img_vh)

    # findContours returns 2 or 3 values depending on the OpenCV release;
    # handle both the same way get_text does.
    found = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]
    if len(contours) == 0:
        # Nothing detected: no rows, and nothing to average or sort.
        return [], image_colour

    # Sort all the contours from top to bottom.
    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")
    # Mean box height, used by arrange_rows as the row-break tolerance.
    mean = np.mean([bounds[3] for bounds in boundingBoxes])

    # Keep boxes wider than 100 px but narrower than 80% of the image width,
    # and between 40 and 500 px tall.
    max_width = 0.8 * image.shape[1]
    box = [[x, y, w, h] for (x, y, w, h) in boundingBoxes
           if 100 < w < max_width and 40 < h < 500]
    return arrange_rows(box, mean), image_colour


def arrange_rows(box, mean):
    """Group ``[x, y, w, h]`` boxes into rows by comparing neighbouring y values.

    A box joins the current row when its y is at most ``mean / 2`` below the y
    of the box immediately before it; otherwise the current row is emitted and
    a new one is started.

    NOTE: the row still being built when the input ends is emitted only if the
    last box joined an existing row. A trailing row made of a single box (and
    therefore a single-box input) is not returned.

    Args:
        box: Boxes in the order they should be scanned.
        mean: Height reference; half of it is the same-row tolerance.

    Returns:
        List of rows, each a list of boxes.
    """
    rows = []
    if not box:
        return rows

    current = [box[0]]
    final_index = len(box) - 1
    for index, (before, candidate) in enumerate(zip(box, box[1:]), start=1):
        starts_new_row = candidate[1] > before[1] + mean / 2
        if starts_new_row:
            rows.append(current)
            current = [candidate]
            continue
        current.append(candidate)
        if index == final_index:
            rows.append(current)
    return rows

def cell_ocr(im, rcts):
    """OCR the two left-most cells of every table row.

    Args:
        im: Image array the boxes refer to (indexed ``im[rows, cols]``).
        rcts: Rows of ``[x, y, w, h]`` boxes, e.g. from ``get_table``.

    Returns:
        A list of ``[first_text, second_text]`` pairs, one per row that has at
        least two cells. Rows with fewer than two cells are skipped, because
        there is no second cell to read.
    """
    output = []
    for row in rcts:
        # Order the cells of the row from left to right.
        cells = sorted(row, key=lambda cell: cell[0])
        if len(cells) < 2:
            continue

        # Crop and upscale both cells 2x before OCR.
        enlarged = []
        for left, top, width, height in cells[:2]:
            crop = im[top:top + height, left:left + width]
            enlarged.append(
                cv2.resize(crop, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC))

        texts = [pytesseract.image_to_string(cell_img, lang='mkd+eng').strip()
                 for cell_img in enlarged]
        output.append(texts)
    return output

def get_text_v(path="images", date_of=(date.today() - timedelta(days=1))):
    """OCR every table image in *path* and write one sheet per registration type.

    Each file is split into rows by ``get_table`` and read by ``cell_ocr``.
    The text found next to the label "Вид на упис" selects the group the file
    belongs to ("custom" when that label is missing). Files of the same group
    are outer-merged on the label column, so every file contributes one value
    column.

    Args:
        path: Directory containing the images.
        date_of: Date used in the output name ``workbook<date_of>.xlsx``.
            NOTE: the default is evaluated once, when the function is defined.

    Returns:
        Dict mapping group name to its merged DataFrame. The three groups
        listed in ``excluded`` below are never included.
    """
    # Groups that must not end up in the workbook.
    excluded = ('Упис на промена', 'Упис на основање', 'Упис на  основање')
    # Characters that are not allowed in an Excel worksheet name.
    invalid_sheet_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})

    type_dict = {}
    for f in os.listdir(path):
        print("Processing File : " + str(f) + " ...")
        r, im = get_table(os.path.join(path, f))
        output = cell_ocr(im, r)
        if not output:
            # No cell pairs were read; there is nothing to group for this file.
            continue

        labels = [pair[0] for pair in output]
        if "Вид на упис" in labels:
            attr_key = output[labels.index("Вид на упис")][1]
        else:
            attr_key = "custom"
        if attr_key in excluded:
            continue

        grp_df = pd.DataFrame(output).groupby(0, as_index=False).agg(
            lambda x: ",".join([str(xc) for xc in x]))
        # Give the value column a per-file name. Merging frames that all call
        # it 1 produces clashing '1_x'/'1_y' suffixes from the fourth file on.
        # Column labels are not written to the workbook (header=False).
        grp_df = grp_df.rename(columns={1: str(f)})

        if attr_key in type_dict:
            type_dict[attr_key] = type_dict[attr_key].merge(grp_df, how="outer", on=0)
        else:
            type_dict[attr_key] = grp_df

    with pd.ExcelWriter("workbook" + str(date_of) + '.xlsx') as writer:
        for k, v in type_dict.items():
            # Sheet names come from OCR text: replace forbidden characters,
            # keep the original 30-character cut, and never pass an empty name.
            sheet_name = k.translate(invalid_sheet_chars)[:30] or "unnamed"
            v.transpose().to_excel(writer, sheet_name=sheet_name, header=False, index=False)
    return type_dict


def _first_column_texts(page):
    """Return the first-cell text of each data row of the results table.

    Returns None when *page* has no ``table.table--mobile``; the first ``<tr>``
    (the header row) is skipped.
    """
    table = page.find('table', {'class': 'table--mobile'})
    if table is None:
        return None
    return [tr.findAll('td')[0].text for tr in table.findAll('tr')[1:]]


def main():
    """Scrape announcement images for every registration type and OCR two of them.

    For each entry of ``excel_name`` except the first, the function selects the
    entry in the page's drop-down, enters the date three days ago, runs the
    search and collects the first-column value of every result row across all
    result pages. Each collected id is then opened and its base64-embedded PNG
    is saved as ``./images/<date 3 days ago>-<image_name[i]><k>.png``.
    ``get_text`` is run afterwards for "Упис на промена" and
    "Упис на основање". The browser is closed when the function ends, also on
    error.
    """
    base_url = "http://crm.com.mk/mk/otvoreni-podatotsi/objavi-na-upisi-za-subjekti"
    # Common ancestor of every element the script interacts with.
    list_root = ('//*[@id="content"]/cms-container/crm-template-fs-latestannouncement'
                 '/crm-cnt-latestannouncement/crm-cnt-latestannouncement-list/div'
                 '/crm-cnt-latestannouncement-list-oss')
    date_input_xpath = list_root + '/div[2]/div/div[1]/div[2]/div[2]/fieldset/input'
    search_button_xpath = list_root + '/div[2]/div/div[2]/div/button[1]'

    # open() below needs the target directory to exist.
    os.makedirs("images", exist_ok=True)

    try:
        driver.get(base_url)
        time.sleep(30)
        for option_index, option_name in enumerate(excel_name):
            print("visiting option : " + option_name)
            if option_index < 1:
                continue

            data_list = []
            get_date = (date.today() - timedelta(days=3)).strftime('%d.%m.%Y')

            # Select the option, type the date and run the search.
            driver.find_element(
                "xpath",
                list_root + '/div[2]/div/div[1]/div[2]/div[1]/fieldset/span/select/option['
                + str(option_index + 1) + ']').click()
            time.sleep(2)
            driver.find_element("xpath", date_input_xpath).send_keys(get_date)
            time.sleep(2)
            driver.find_element("xpath", search_button_xpath).click()
            time.sleep(10)
            page_content = soup(driver.page_source, 'html.parser')

            first_page_ids = _first_column_texts(page_content)
            if first_page_ids is not None:
                pager = page_content.find('ul', {'class': 'ngx-pagination'})
                if pager is not None:
                    page_list = pager.findAll("li")
                    # The second-to-last <li> carries the number of the last page.
                    last_page_text = page_list[len(page_list) - 2].text.replace('page ', '')
                    print(last_page_text)

                    for i in range(int(last_page_text)):
                        time.sleep(3)
                        # Page links start at the third <li> of the pager.
                        driver.find_element(
                            "xpath",
                            list_root + '/div[4]/div/div/pagination-controls/pagination-template/ul/li['
                            + str(i + 3) + ']').click()
                        time.sleep(3)

                        page_ids = _first_column_texts(soup(driver.page_source, 'html.parser'))
                        if page_ids is None:
                            # Table vanished: stop paging for this option only.
                            break
                        data_list.extend(page_ids)
                else:
                    data_list.extend(first_page_ids)

            print("number of items found in option " + option_name + " : " + str(len(data_list)))
            data_number = len(data_list)
            if data_number == 0:
                # Nothing to download: reset the date field for the next option.
                driver.find_element("xpath", date_input_xpath).clear()
                continue

            for k, item_id in enumerate(data_list):
                print("Downloading image number : " + str(k) + "/" + str(len(data_list)))
                driver.get(base_url + "?id=" + item_id + "&p=1")
                time.sleep(60)
                page_cont = soup(driver.page_source, 'html.parser')
                holder = page_cont.find('div', {'class': 'row center'})
                if holder is None:
                    continue
                image_src = holder.div.img['src']

                try:
                    imagedata = base64.b64decode(image_src.replace('data:image/png;base64,', ''))
                    # Same date offset (3 days) as the path get_text reads from.
                    image_path = ("./images/" + str(date.today() - timedelta(days=3)) + '-'
                                  + image_name[option_index] + str(k) + ".png")
                    with open(image_path, "wb") as image:
                        image.write(imagedata)
                except (ValueError, OSError):
                    # ValueError covers malformed base64; OSError covers file errors.
                    print("An exception occurred on image " + str(k) + " with id : " + str(item_id))

            driver.get(base_url)
            time.sleep(20)
            if option_name in ("Упис на промена", "Упис на основање"):
                get_text(data_number, image_name[option_index], option_name)
    finally:
        driver.close()

# Script entry: scrape, build the workbook, then archive the processed images.
main()
print("Generating workbook please wait ...")
get_text_v()
print("Workbook file generated !!")

print("Moving files from images to oldimages ...")

source_dir = 'images'
target_dir = 'oldimages'

# The archive directory must exist. If it does not, shutil.move() treats
# 'oldimages' as a file name and renames each image onto that single path.
os.makedirs(target_dir, exist_ok=True)

file_names = os.listdir(source_dir)

for file_name in file_names:
    print("moving file " + str(file_name) + " ...")
    try:
        shutil.move(os.path.join(source_dir, file_name), target_dir)
    except shutil.Error:
        # Raised by shutil.move when the destination already contains the file.
        print("An exception occurred, File already exist !!!")
    except OSError as error:
        # Any other failure is reported as what it is; the loop keeps going.
        print("Could not move file " + str(file_name) + " : " + str(error))
print("Moving files from images to oldimages Done !!!")
Abhishek Rai
  • 2,159
  • 3
  • 18
  • 38
  • Captcha was introduced to avoid bots and accessing websites programmatically. Thus if it were that easy to avoid, wouldn't that make it pointless and redundant? Regardless, take a look here: [How to bypass Google captcha with Selenium and Python?](https://stackoverflow.com/questions/58872451/how-to-bypass-google-captcha-with-selenium-and-python) – PApostol Nov 30 '20 at 16:06
  • @PApostol I know. I was hoping to select the captcha using selenium and solve it..but it crashes...that's the issue...any guesses? – Abhishek Rai Nov 30 '20 at 16:12
  • If it crashes it might be useful to include the stack trace with your post, or even reduce the code to only reproduce the problem without any extra parts if possible. However there are several pointers on Stack Overflow you could try, like [Dealing with reCAPTCHA in Python Selenium](https://stackoverflow.com/questions/59403852/dealing-with-recaptcha-in-python-selenium) – PApostol Nov 30 '20 at 17:01
  • Well, I was able to click it once when it didn't crash with Firefox..however, it still doesn't load the next page...So, this question has no solution as of now. – Abhishek Rai Nov 30 '20 at 20:05

0 Answers