I am trying to download images using this code. However, the website presents a captcha, and when I try to select the captcha it displays a broken-PC image. I cannot figure out a way past this. Is there a way to avoid the captcha altogether, or to select it somehow so that its options can be clicked via Selenium? The code is long, but it is an MRE (minimal reproducible example).
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from datetime import date ,timedelta
import requests
import time
import base64
import cv2
import pytesseract
import xlsxwriter
import numpy as np
import pandas as pd
import os
import shutil
# Module-level configuration shared by the scraping and OCR steps below.

# Selenium browser session used by main().  "chromedriver" is already the
# default driver location in Selenium 3 / early Selenium 4, and newer Selenium
# releases no longer accept it as a positional argument, so it is omitted.
driver = webdriver.Chrome()

# Location of the Tesseract executable that pytesseract shells out to.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Labels of the options in the site's drop-down.  The order matters: main()
# clicks option number (index + 1) of the <select> for each entry, and the
# label is also used in the name of the workbook written by get_text().
excel_name = [
    'Сите',
    'Упис на основање',
    '2б Ново со спојување на __',
    '3б Ново со раздвојување од __',
    '5б Ново со издвојување од__',
    'Упис на промена',
    'Документ за корекција',
    '1б Присоединување во __',
    '4б Превземање со раздвојување на __',
    '5а Издвојување во ново__',
    '6б Превземање од__',
    '6а Издвојување во __',
    'Документ за регистрирање на работно време',
    'Документ за определување на главна приходна шифра и големина',
    'Документ за евидентирање на казна/санкција',
    'Документ за евидентирање на бришење на казна/санкција',
    'Документ за евидентирање на стечај на друг субјект',
    'Документ за евидентирање на заклучување на стечај на друг субјект',
    'Упис на бришење',
    '2а Спојување во ново__со бришење',
    '4а Раздвојување со превземање во __ со бришење',
    '3а Раздвојување на ново __ со бришење',
    '1а Присоединување на __ со бришење',
    'Судска Процедура - Стечај',
    'Ликвидација',
    'Претстечај (Претходна постапка)',
    'Објава(Друго)',
    'Објава(Стечајна постапка)',
    'Објава(Ликвидациона постапка)',
    'Вонсудска спогодба',
    'Објава(Вонсудска спогодба)',
    'Предбелешка',
]
# Shorter list for quick manual runs:
# excel_name = ['Сите','Упис на основање','2б Ново со спојување на __','Упис на промена']

# File-name prefix for the images of each option, parallel to excel_name:
# 'image', 'image0', 'image1', ..., 'image30'.
image_name = ['image'] + ['image' + str(n) for n in range(31)]
def _external_contours(mask):
    """Return the external contours of *mask* on both OpenCV 3 and OpenCV 4."""
    found = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns (contours, hierarchy) or (image, contours, hierarchy)
    # depending on the OpenCV version.
    return found[0] if len(found) == 2 else found[1]


def _strip_table_lines(image):
    """Return a copy of the BGR *image* prepared for OCR.

    Long horizontal and vertical strokes and small specks are removed from an
    Otsu-thresholded mask; everything outside the remaining mask is painted
    white in the returned image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Remove horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
    detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    for c in _external_contours(detect_horizontal):
        cv2.drawContours(thresh, [c], -1, (0, 0, 0), 2)

    # Remove vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))
    detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    for c in _external_contours(detect_vertical):
        cv2.drawContours(thresh, [c], -1, (0, 0, 0), 3)

    # Dilate to connect text, then erase blobs too small to be text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=2)
    for c in _external_contours(dilate):
        if cv2.contourArea(c) < 500:
            cv2.drawContours(dilate, [c], -1, (0, 0, 0), -1)

    # Bitwise-and to reconstruct the image, with a white background
    result = cv2.bitwise_and(image, image, mask=dilate)
    result[dilate == 0] = (255, 255, 255)
    return result


def get_text(data_number, image_name, excel_name):
    """OCR the downloaded images of one option into a one-column workbook.

    Reads ``./images/<today-3d>-<image_name><i>.png`` for ``i`` in
    ``range(data_number)`` and writes the recognised text of each readable
    image to the next free row of sheet "content" in
    ``<today-1d>-<excel_name>.xlsx``.  Images that cannot be read are skipped
    and do not consume a row.

    Args:
        data_number: number of image indices to try.
        image_name: file-name prefix of the images.
        excel_name: label used in the workbook file name.
    """
    workbook = xlsxwriter.Workbook(str(date.today() - timedelta(days=1)) + '-' + excel_name + '.xlsx')
    try:
        worksheet = workbook.add_worksheet("content")
        row = 0
        print(image_name, data_number)
        # Computed once so every image of this run uses the same date stamp.
        image_day = str(date.today() - timedelta(days=3))
        for i in range(data_number):
            image_path = './images/' + image_day + '-' + image_name + str(i) + '.png'
            print(image_path)
            image = cv2.imread(image_path)
            # cv2.imread signals a missing/unreadable file by returning None;
            # test for that explicitly instead of swallowing every exception.
            if image is None:
                continue
            result = _strip_table_lines(image)
            # OCR
            data = pytesseract.image_to_string(result, lang='mkd+eng', config='--psm 6')
            worksheet.write(row, 0, data)
            row += 1
    finally:
        # Always release the workbook, even if OCR fails part-way through.
        workbook.close()
def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: sequence of contours accepted by ``cv2.boundingRect``.
        method: one of "left-to-right", "right-to-left", "top-to-bottom" or
            "bottom-to-top".  Any other value behaves like "left-to-right".

    Returns:
        A pair ``(contours, bounding_boxes)`` of equally long tuples in sorted
        order; both are empty tuples when *cnts* is empty.
    """
    # zip(*...) on an empty sequence cannot be unpacked into two names, so
    # handle the "no contours" case explicitly.
    if len(cnts) == 0:
        return ((), ())

    # Reverse order for the two "backwards" directions.
    reverse = method in ("right-to-left", "bottom-to-top")
    # Bounding boxes are (x, y, w, h): sort on y (index 1) for the vertical
    # directions, on x (index 0) otherwise.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b: b[1][axis], reverse=reverse))
    return (cnts, boundingBoxes)
def get_table(path):
    """Locate table cells in the image stored at *path*.

    The image is binarised, long vertical and horizontal strokes are isolated
    with morphological erode/dilate, and the contours of the combined line
    mask are turned into bounding boxes.  Boxes whose width is between 100 px
    and 80 % of the image width and whose height is between 40 and 500 px are
    kept and grouped into rows by ``arrange_rows``.

    Side effect: the intermediate masks are written to ``vertical.jpg``,
    ``horizontal.jpg`` and ``img_vh.jpg`` in the current directory
    (overwritten on every call).

    Args:
        path: path of the image file.

    Returns:
        ``(rows, image_colour)`` where *rows* is the result of
        ``arrange_rows`` (lists of ``[x, y, w, h]`` boxes) and *image_colour*
        is the image as loaded by ``cv2.imread(path)``.

    Raises:
        ValueError: if the file cannot be read as an image.
    """
    image = cv2.imread(path, 0)
    image_colour = cv2.imread(path)
    # cv2.imread returns None instead of raising for unreadable files.
    if image is None or image_colour is None:
        raise ValueError("Could not read image file: " + str(path))

    _, img = cv2.threshold(image, 240, 255, cv2.THRESH_BINARY)
    img_bin = 255 - img
    image_width = image.shape[1]
    # Kernel length is 1 % of the width; never let it collapse to 0 for
    # images narrower than 100 px.
    kernel_len = max(1, image_width // 100)

    # Vertical kernel to detect all vertical lines of the image
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)
    cv2.imwrite("vertical.jpg", vertical_lines)

    # Horizontal kernel to detect all horizontal lines of the image
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)
    cv2.imwrite("horizontal.jpg", horizontal_lines)

    # Combine both line masks, then erode and threshold the result
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    _, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite("img_vh.jpg", img_vh)

    # findContours returns 2 values on OpenCV 4 and 3 values on OpenCV 3;
    # pick the contour list the same way get_text does.
    found = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]
    if len(contours) == 0:
        # Nothing detected: no rows, but still hand back the colour image.
        return [], image_colour

    # Sort all the contours from top to bottom.
    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")
    # Mean height of all detected boxes, used as the row tolerance.
    mean = np.mean([bounds[3] for bounds in boundingBoxes])

    # Keep only boxes with plausible cell dimensions.
    box = [
        [x, y, w, h]
        for (x, y, w, h) in boundingBoxes
        if 100 < w < 0.8 * image_width and 40 < h < 500
    ]
    return arrange_rows(box, mean), image_colour
def arrange_rows(box, mean):
    """Group consecutive boxes into rows.

    A box stays in the current row while its element 1 (the y coordinate of
    the ``[x, y, w, h]`` boxes built by get_table) is at most ``mean / 2``
    larger than that of the previous box; otherwise the current row is closed
    and the box starts a new one.

    NOTE(review): a row is only emitted when a later box closes it or when the
    very last box extends it.  Consequently a trailing row that consists of a
    single box (including the case of a single-box input) is not returned.
    This matches the long-standing behaviour and is kept on purpose because
    cell_ocr expects at least two cells per row - confirm before changing.

    Args:
        box: list of boxes, in the order they should be scanned.
        mean: mean box height; half of it is the row tolerance.

    Returns:
        List of rows, each a list of boxes from *box*.
    """
    rows = []
    current = []
    previous = None
    last_index = len(box) - 1

    for index, cell in enumerate(box):
        if previous is None or cell[1] <= previous[1] + mean / 2:
            current.append(cell)
            # Flush the open row when the final box extends it.
            if index == last_index and index > 0:
                rows.append(current)
        else:
            rows.append(current)
            current = [cell]
        previous = cell

    return rows
def _ocr_cell(im, rect):
    """Crop *rect* (``[x, y, w, h]``) out of *im*, upscale it 2x and OCR it."""
    x, y, w, h = rect
    # Image arrays are indexed [row (y), column (x)].
    crop = im[y:y + h, x:x + w]
    enlarged = cv2.resize(crop, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    return pytesseract.image_to_string(enlarged, lang='mkd+eng').strip()


def cell_ocr(im, rcts):
    """OCR the first two cells of every table row.

    Args:
        im: image array the boxes refer to.
        rcts: rows of ``[x, y, w, h]`` boxes, as produced by get_table.

    Returns:
        A list of ``[left_text, right_text]`` pairs, one per row, taken from
        the two left-most boxes of the row (boxes are ordered by x).  Rows
        with fewer than two boxes cannot form a pair and are skipped.
    """
    output = []
    for row in rcts:
        cells = sorted(row, key=lambda rect: rect[0])
        if len(cells) < 2:
            # Previously this raised IndexError; a lone cell has no partner.
            continue
        output.append([_ocr_cell(im, cells[0]), _ocr_cell(im, cells[1])])
    return output
def get_text_v(path="images", date_of=None):
    """OCR every image in *path* and write the results to one workbook.

    Each file is passed through get_table and cell_ocr, giving a list of
    ``[label, value]`` pairs.  Files are grouped by the value found next to
    the label "Вид на упис" (or under "custom" when that label is absent);
    within a group the per-file values are merged on the label.  The groups
    'Упис на промена' and 'Упис на основање' are removed, and every remaining
    group is written, transposed, to its own sheet of
    ``workbook<date_of>.xlsx``.

    Args:
        path: directory containing the images.
        date_of: value used in the workbook file name; defaults to yesterday,
            evaluated at call time.

    Returns:
        Dict mapping group name to the merged ``pandas.DataFrame``.  Empty
        (and no workbook is written) when nothing was recognised.
    """
    if date_of is None:
        # Resolved per call; a default expression in the signature would be
        # frozen at import time.
        date_of = date.today() - timedelta(days=1)

    # Characters that are not allowed in Excel sheet names.
    invalid_sheet_chars = str.maketrans({ch: "_" for ch in "[]:*?/\\"})

    type_dict = {}
    # Sorted so the column order of the output does not depend on the
    # arbitrary order returned by os.listdir.
    for f in sorted(os.listdir(path)):
        print("Processing File : " + str(f) + " ...")
        r, im = get_table(os.path.join(path, f))
        output = cell_ocr(im, r)
        if not output:
            # No label/value pairs recognised: nothing to group or merge.
            continue

        labels = [pair[0] for pair in output]
        try:
            attr_key = output[labels.index("Вид на упис")][1]
        except ValueError:
            attr_key = "custom"

        # One row per label; repeated labels have their values comma-joined.
        grp_df = pd.DataFrame(output).groupby(0, as_index=False).agg(
            lambda x: ",".join([str(xc) for xc in x]))
        # Give the value column a per-file name.  Merging many frames that all
        # call it 1 makes pandas generate clashing "_x"/"_y" suffixes.  The
        # names are never written (header=False below).
        grp_df = grp_df.rename(columns={1: f})

        if attr_key in type_dict:
            type_dict[attr_key] = type_dict[attr_key].merge(grp_df, how="outer", on=0)
        else:
            type_dict[attr_key] = grp_df

    type_dict.pop('Упис на промена', None)  # this should delete the Упис на промена sheet
    type_dict.pop('Упис на основање', None)  # this should delete the Упис на основање sheet

    if not type_dict:
        # An Excel file needs at least one sheet; skip writing altogether.
        print("No data found in " + str(path) + ", workbook not written")
        return type_dict

    with pd.ExcelWriter("workbook" + str(date_of) + '.xlsx') as writer:
        for k, v in type_dict.items():
            sheet_name = k.translate(invalid_sheet_chars)[:30]
            v.transpose().to_excel(writer, sheet_name=sheet_name, header=False, index=False)
    return type_dict
def _first_column_texts(table):
    """Return the text of the first <td> of every row after the header row."""
    return [tr.findAll('td')[0].text for tr in table.findAll('tr')[1:]]


def _by_xpath(xpath):
    """Locate one element by XPath.

    Uses the generic ``find_element`` call, which is available in both
    Selenium 3 and Selenium 4 (``find_element_by_xpath`` was removed in 4.3).
    """
    return driver.find_element("xpath", xpath)


def main():
    """Scrape the announcement images for every option in ``excel_name``.

    For each option except the first one the function selects the option,
    enters the date of three days ago, runs the search, collects the first
    column of the result table (following pagination when present), and then
    opens every collected id and stores the embedded base64 PNG as
    ``./images/<today-3d>-<image_name[option]><k>.png``.  For the options
    'Упис на промена' and 'Упис на основање' the images are additionally
    OCR-ed through get_text.  The browser window is closed at the end, also
    when an error aborts the run.
    """
    base_url = "http://crm.com.mk/mk/otvoreni-podatotsi/objavi-na-upisi-za-subjekti"
    list_root = ('//*[@id="content"]/cms-container/crm-template-fs-latestannouncement'
                 '/crm-cnt-latestannouncement/crm-cnt-latestannouncement-list/div'
                 '/crm-cnt-latestannouncement-list-oss')
    option_xpath = list_root + '/div[2]/div/div[1]/div[2]/div[1]/fieldset/span/select/option['
    date_xpath = list_root + '/div[2]/div/div[1]/div[2]/div[2]/fieldset/input'
    search_xpath = list_root + '/div[2]/div/div[2]/div/button[1]'
    page_xpath = list_root + '/div[4]/div/div/pagination-controls/pagination-template/ul/li['
    table_attrs = {'class': 'table--mobile'}

    try:
        driver.get(base_url)
        # NOTE(review): long fixed wait after the first page load - presumably
        # leaves time for the page (and any captcha) to be dealt with; confirm.
        time.sleep(30)

        for option_index, option_label in enumerate(excel_name):
            print("visiting option : " + option_label)
            data_list = []
            if option_index < 1:
                # The first option is skipped.
                continue

            # The same 3-day offset is used for the search date, for the image
            # file names written below, and by get_text when reading them back.
            target_day = date.today() - timedelta(days=3)
            get_date = target_day.strftime('%d.%m.%Y')
            image_day = str(target_day)

            _by_xpath(option_xpath + str(option_index + 1) + ']').click()
            time.sleep(2)
            _by_xpath(date_xpath).send_keys(get_date)
            time.sleep(2)
            _by_xpath(search_xpath).click()
            time.sleep(10)

            page_content = soup(driver.page_source, 'html.parser')
            table = page_content.find('table', table_attrs)
            if table is not None:
                pagination = page_content.find('ul', {'class': 'ngx-pagination'})
                if pagination is not None:
                    page_list = pagination.findAll("li")
                    page_total = page_list[len(page_list) - 2].text.replace('page ', '')
                    print(page_total)
                    for i in range(int(page_total)):
                        time.sleep(3)
                        _by_xpath(page_xpath + str(i + 3) + ']').click()
                        time.sleep(3)
                        page_res = soup(driver.page_source, 'html.parser')
                        page_table = page_res.find('table', table_attrs)
                        if page_table is None:
                            # No table on this page: stop paging.  A plain
                            # break replaces the old `count` flag, which could
                            # stay set and cut short the next option.
                            break
                        data_list.extend(_first_column_texts(page_table))
                else:
                    data_list.extend(_first_column_texts(table))

            print("number of items found in option " + option_label + " : " + str(len(data_list)))
            data_number = len(data_list)
            if data_number == 0:
                # Reset the date field so the next option starts clean.
                _by_xpath(date_xpath).clear()
                continue

            os.makedirs("./images", exist_ok=True)
            for k, record_id in enumerate(data_list):
                print("Downloading image number : " + str(k) + "/" + str(len(data_list)))
                driver.get(base_url + "?id=" + record_id + "&p=1")
                time.sleep(60)
                page_cont = soup(driver.page_source, 'html.parser')
                holder = page_cont.find('div', {'class': 'row center'})
                if holder is None:
                    continue
                image_src = holder.div.img['src']
                try:
                    imagedata = base64.b64decode(image_src.replace('data:image/png;base64,', ''))
                    image_path = "./images/" + image_day + '-' + image_name[option_index] + str(k) + ".png"
                    with open(image_path, "wb") as image:
                        image.write(imagedata)
                except (ValueError, OSError):
                    # ValueError covers malformed base64, OSError covers write failures.
                    print("An exception occurred on image " + str(k) + " with id : " + str(record_id))

            # Return to the search page for the next option.
            driver.get(base_url)
            time.sleep(20)
            if option_label in ("Упис на промена", "Упис на основање"):
                get_text(data_number, image_name[option_index], option_label)
    finally:
        driver.close()
# Directories used when archiving processed images.
source_dir = 'images'
target_dir = 'oldimages'

if __name__ == "__main__":
    main()
    print("Generating workbook please wait ...")
    get_text_v()
    print("Workbook file generated !!")
    print("Moving files from images to oldimages ...")
    # Without an existing target directory shutil.move would rename each file
    # to the path 'oldimages' itself instead of moving it into a folder.
    os.makedirs(target_dir, exist_ok=True)
    file_names = os.listdir(source_dir)
    for file_name in file_names:
        print("moving file " + str(file_name) + " ...")
        try:
            shutil.move(os.path.join(source_dir, file_name), target_dir)
        except shutil.Error:
            # Raised by shutil.move when the destination already exists.
            print("An exception occurred, File already exist !!!")
        except OSError as err:
            # Any other failure: keep going, but report what actually happened.
            print("An exception occurred while moving " + str(file_name) + " : " + str(err))
    print("Moving files from images to oldimages Done !!!")