This may help:
import re
from lxml import html
BASE_NAME = "image_"
source_code = """<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA
AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="Red dot" />
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs=" alt="Black dot" />"""
tree = html.fromstring(source_code)
for i,image in enumerate(tree.xpath('//img[contains(@src, "data:image")]/@src')):
image_type, image_content = image.split(',', 1)
image_type = re.findall('data:image\/(\w+);base64', image_type)[0]
with open("{}{}.{}".format(BASE_NAME, i, image_type), "wb") as f:
f.write(image_content.decode('base64'))
print "[*] '{}' image found with content: {}\n".format(image_type, image_content)
Output:
[*] 'png' image found with content: iVBORw0KGgoAAAANSUhEUgAAAAUA
AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO
9TXL0Y4OHwAAAABJRU5ErkJggg==
[*] 'gif' image found with content: R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs=
It will save every base64
image within <img>
tags, with their respective file extension:
Prefixed by BASE_NAME + auto-increment digit(s) provided by enumerate
+ image_extension
