0

I have the PDF invoice shown below.

I have to create bounding boxes to retrieve the values of Date, Due Date, Invoice #, Amount Due, Address and some other fields, but the bounding boxes I create do not extract the correct values. The code should work for any type of invoice template and should retrieve the needed values.

enter image description here

Below is the information needed from the invoice:

enter image description here

However, the boxes are not being created properly.

Below is my code:

    # Maximum distance to merge boxes
max_distance = 50

# Merge nearby boxes.
#
# `d` is presumably the dict returned by pytesseract.image_to_data(...,
# output_type=Output.DICT) and `n_boxes` its number of entries -- both are
# defined outside this snippet; confirm against the calling code.
#
# Each sufficiently confident OCR entry either grows the FIRST existing box
# it lies close to, or starts a new box of its own. Boxes are (x, y, w, h).
boxes = []
for i in range(n_boxes):
    # float() instead of int(): depending on the pytesseract / Tesseract
    # versions in use, the confidence may be an int, a float, or a decimal
    # string such as '96.27', and int('96.27') raises ValueError.
    if float(d['conf'][i]) <= 50:
        continue

    x, y, w, h = d['left'][i], d['top'][i], d['width'][i], d['height'][i]

    for j, (bx, by, bw, bh) in enumerate(boxes):
        # Proximity of the new entry's top-left corner to the existing box.
        near_left = abs(x - bx) < max_distance
        near_right = abs(x - (bx + bw)) < max_distance
        near_top = abs(y - by) < max_distance
        near_bottom = abs(y - (by + bh)) < max_distance

        # Same three cases as before, which all performed the same merge:
        #   - starts near the box's top-left corner,
        #   - starts near the box's bottom-left corner (next line down),
        #   - starts near the box's top-right corner (next word on the line).
        if (near_left and (near_top or near_bottom)) or (near_right and near_top):
            left = min(x, bx)
            top = min(y, by)
            right = max(x + w, bx + bw)
            bottom = max(y + h, by + bh)
            # Replace the box with the union of itself and the new entry.
            boxes[j] = (left, top, right - left, bottom - top)
            break
    else:
        # No existing box was close enough: start a new one.
        boxes.append((x, y, w, h))

# Extract text from merged boxes
#
# OCR every box BEFORE drawing anything. Previously each rectangle was drawn
# onto `image` inside this loop, so later crops were taken from an image that
# already carried the green borders of earlier (possibly overlapping or
# adjacent) boxes, which can corrupt the recognised text.
for (x, y, w, h) in boxes:
    cropped_img = image[y:y+h, x:x+w]
    text = pytesseract.image_to_string(cropped_img)
    print(f'Text in box ({x}, {y}, {w}, {h}): {text.strip()}')

# Draw merged boxes on image (second pass, after all OCR is done)
for (x, y, w, h) in boxes:
    image = cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Display image
# The channels are swapped from B,G,R order to R,G,B before handing the image
# to matplotlib (assumes `image` is a 3-channel BGR array, as the original
# b/g/r split did).
rgb_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(16,12))
plt.imshow(rgb_img)
plt.title('SAMPLE INVOICE WITH MERGED BOXES FOR KEY-VALUE PAIRS')
plt.show()
bay bay
  • 11
  • 1

0 Answers0