22

I do not known how to load a eml file in python 3.4.
I want to list all and read all of them in python.

enter image description here

β.εηοιτ.βε
  • 33,893
  • 13
  • 69
  • 83
Bảo Nguyễn
  • 285
  • 1
  • 2
  • 5
  • 3
    Hello and welcome to StackOverflow. Please take some time to read the help page, especially the sections named ["What topics can I ask about here?"](http://stackoverflow.com/help/on-topic) and ["What types of questions should I avoid asking?"](http://stackoverflow.com/help/dont-ask). And more importantly, please read [the Stack Overflow question checklist](http://meta.stackexchange.com/q/156810/204922). You might also want to learn about [Minimal, Complete, and Verifiable Examples](http://stackoverflow.com/help/mcve). – Morgan Thrapp Jul 13 '15 at 20:03
  • 2
    @Dalen Since when do we judge questions based on how "well" they approximate the worst questions on the site? – Two-Bit Alchemist Jul 13 '15 at 21:10

6 Answers6

42

This is how you get content of an e-mail i.e. *.eml file. This works perfectly on Python2.5 - 2.7. Try it on 3. It should work as well.



from email import message_from_file
import os

# Path to directory where attachments will be stored:
path = "./msgfiles"

# To have attachments extracted into memory, change behaviour of 2 following functions:

def file_exists (f):
    """Checks whether extracted file was extracted before."""
    return os.path.exists(os.path.join(path, f))

def save_file (fn, cont):
    """Saves cont to a file fn"""
    file = open(os.path.join(path, fn), "wb")
    file.write(cont)
    file.close()

def construct_name (id, fn):
    """Constructs a file name out of messages ID and packed file name"""
    id = id.split(".")
    id = id[0]+id[1]
    return id+"."+fn

def disqo (s):
    """Removes double or single quotations."""
    s = s.strip()
    if s.startswith("'") and s.endswith("'"): return s[1:-1]
    if s.startswith('"') and s.endswith('"'): return s[1:-1]
    return s

def disgra (s):
    """Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
    s = s.strip()
    if s.startswith("<") and s.endswith(">"): return s[1:-1]
    return s

def pullout (m, key):
    """Extracts content from an e-mail message.
    This works for multipart and nested multipart messages too.
    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text  -- All text from all parts.
    Html  -- All HTMLs from all parts
    Files -- Dictionary mapping extracted file to message ID it belongs to.
    Parts -- Number of parts in original message.
    """
    Html = ""
    Text = ""
    Files = {}
    Parts = 0
    if not m.is_multipart():
        if m.get_filename(): # It's an attachment
            fn = m.get_filename()
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, None)
            if file_exists(cfn): return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        # Not an attachment!
        # See where this belongs. Text, Html or some other data:
        cp = m.get_content_type()
        if cp=="text/plain": Text += m.get_payload(decode=True)
        elif cp=="text/html": Html += m.get_payload(decode=True)
        else:
            # Something else!
            # Extract a message ID and a file name if there is one:
            # This is some packed file and name is contained in content-type header
            # instead of content-disposition header explicitly
            cp = m.get("content-type")
            try: id = disgra(m.get("content-id"))
            except: id = None
            # Find file name:
            o = cp.find("name=")
            if o==-1: return Text, Html, Files, 1
            ox = cp.find(";", o)
            if ox==-1: ox = None
            o += 5; fn = cp[o:ox]
            fn = disqo(fn)
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, id)
            if file_exists(cfn): return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
        return Text, Html, Files, 1
    # This IS a multipart message.
    # So, we iterate over it and call pullout() recursively for each part.
    y = 0
    while 1:
        # If we cannot get the payload, it means we hit the end:
        try:
            pl = m.get_payload(y)
        except: break
        # pl is a new Message object which goes back to pullout
        t, h, f, p = pullout(pl, key)
        Text += t; Html += h; Files.update(f); Parts += p
        y += 1
    return Text, Html, Files, Parts

def extract (msgfile, key):
    """Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
    msgfile -- A file-like readable object
    key     -- Some ID string for that particular Message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    Key files will be present only when message contained binary files.
    For more see __doc__ for pullout() and caption() functions.
    """
    m = message_from_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip(); Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
        "text": Text, "html": Html, "parts": Parts}
    if Files: msg["files"] = Files
    return msg

def caption (origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    Date = ""
    if origin.has_key("date"): Date = origin["date"].strip()
    From = ""
    if origin.has_key("from"): From = origin["from"].strip()
    To = ""
    if origin.has_key("to"): To = origin["to"].strip()
    Subject = ""
    if origin.has_key("subject"): Subject = origin["subject"].strip()
    return From, To, Subject, Date

# Usage:
f = open("message.eml", "rb")
print extract(f, f.name)
f.close()

I programmed this for my mailgroup using mailbox, that is why it is so convoluted. It never failed me. Never any junk. If message is multipart, output dictionary will contain a key "files" (a sub dict) with all filenames of extracted other files that were not text or html. That was a way of extracting attachments and other binary data. You may change it in pullout(), or just change the behaviour of file_exists() and save_file().

construct_name() constructs a filename out of message id and multipart message filename, if there is one.

In pullout() the Text and Html variables are strings. For online mailgroup it was OK to get any text or HTML packed into multipart that wasn't an attachment at once.

If you need something more sophisticated change Text and Html to lists and append to them and add them as needed. Nothing problematic.

Maybe there are some errors here, because it is intended to work with mailbox.Message(), not with email.Message(). I tried it on email.Message() and it worked fine.

You said, you "wish to list them all". From where? If you refer to the POP3 mailbox or a mailbox of some nice open-source mailer, then you do it using mailbox module. If you want to list them from others, then you have a problem. For example, to get mails from MS Outlook, you have to know how to read OLE2 compound files. Other mailers rarely refer to them as *.eml files, so I think this is exactly what you would like to do. Then search on PyPI for olefile or compoundfiles module and Google around for how to extract an e-mail from MS Outlook inbox file. Or save yourself a mess and just export them from there to some directory. When you have them as eml files, then apply this code.

Dalen
  • 4,128
  • 1
  • 17
  • 35
  • 1
    Please elaborate on this answer and do not simply link off site. Your link could break which would invalidate this answer. – Two-Bit Alchemist Jul 13 '15 at 21:11
  • @Two-Bit Alchemist, since never. I was just a bit sympathetic and got myself down voted because of that. – Dalen Jul 13 '15 at 23:09
  • 1
    @Two-Bit Alchemist, As the link is on SO, I hope it won't break. I extracted that piece of code in haste. I'll copy it here and comment it properly soon, but, let b.enoit.be have a go first. :D – Dalen Jul 13 '15 at 23:13
  • @Two-BitAlchemist Alchemist, I elaborated! Are you happy now? – Dalen Jul 15 '15 at 00:02
  • I changed my vote, if that's what you mean. It's a very nice-looking answer now, although I didn't test your code. Thank you for contributing! – Two-Bit Alchemist Jul 15 '15 at 14:35
  • @Dalen i keep getting the error, "expected string got bytes" , and if i try to give it string it says "expected bytes got string" – Rohan Baisantry May 04 '18 at 14:02
  • At which point of your program exactly do you get this error? – Dalen May 04 '18 at 15:04
  • 1
    @Dalen, when message_from_file() is called. i pass the path of the .eml file as the agrument. – Rohan Baisantry May 07 '18 at 06:20
  • @RohanBaisantry : Didn't you see the usage? You pass the open file descriptor in, not the path to the eml file. – Dalen May 08 '18 at 12:05
  • 1
    Great work! After a few changes, I now got it running in Python 3. The only part that confuses me is the "# Something else!" part. My code never gets there. Also, I would suggest decoding the filenames, as one else will run into to so pretty ugly filenames that crash on occasion. – Esben Eickhardt Oct 08 '19 at 14:01
  • @EsbenEickhardt : # Something else part will be executed if your email e.g. has images or if it is digitally signed. That part of the code assumes that that part of the e-mail is a binary 'thing' that has to be treated as an attachment as this code wasn't used to display them, just to extract parts from it. Then, if I wish to display such images I modified the HTML tag to reflect my new path. – Dalen Oct 08 '19 at 14:27
  • @EsbenEickhardt : As for file names, bare in mind that mailers somewhat changed since this code was written. Also, take care that construct_name(), as given here, is not safe. Someone can put relative paths into the attachment's name and thus write things all over your disk. I repeat, I extracted this code from a larger piece, so this is not exactly the production version of it. Also, please post a comment with changes necessary for the code to work on Python3. So that other people have a little less work to do. – Dalen Oct 08 '19 at 14:44
  • @Dalen When I have make the code handle the unpacking of "smime.p7s", I will make sure to post it. – Esben Eickhardt Oct 09 '19 at 07:50
  • 2
    @EsbenEickhardt what did you change to make it work on python3 I'm stuck at the `string argument expected, got 'bytes'` – TheCrazyProfessor Mar 14 '22 at 12:33
  • @TheCrazyProfessor here is my code https://github.com/esbeneickhardt/emlTextExtraction there is an example of the use here: https://github.com/esbeneickhardt/emlTextExtraction/blob/master/notebook/test.ipynb – Esben Eickhardt Mar 15 '22 at 09:55
  • `if` and `:` and another order on the same line. Wow. So useful when you want to add a breakpoint... and so nice for clarity too... `black` to the rescue!!! – Olivier Pons Sep 29 '22 at 20:38
  • Use `Text += str(m.get_payload(decode=True).decode('utf-8'))` to get `str` text. Else, it returns a `byte` type object. – Bikash Gyawali May 25 '23 at 15:23
  • 1
    @BikashGyawali : Good point, although putting it into str() shouldn't be necessary as *.decode("utf-8") will return str(). I also advice making the bytes().decode() ignore errors, just in case: Text += m.get_payload(decode=True).decode("UTF-8", "ignore") Thank you for helping Py3 users with my mess. :D – Dalen May 25 '23 at 23:05
27

I found this code much simpler

import email
import os

path = './'
listing = os.listdir(path)

for fle in listing:
    if str.lower(fle[-3:])=="eml":
        msg = email.message_from_file(open(fle))
        attachments=msg.get_payload()
        for attachment in attachments:
            try:
                fnam=attachment.get_filename()
                f=open(fnam, 'wb').write(attachment.get_payload(decode=True,))
                f.close()
            except Exception as detail:
                #print detail
                pass
albert
  • 8,112
  • 3
  • 47
  • 63
Mike
  • 271
  • 3
  • 3
  • 5
    Certainly simpler, but much more limited. I explained why mine is so warped. People will use one they find better suited for their needs. You have +1 from me for offering another solution. – Dalen Apr 28 '17 at 16:06
  • i got FileNotFoundError but i dont know why as I'm getting the file names from the listing – axel axel May 15 '19 at 12:39
  • Even for this I suggest removing try&except since unused variable (flake8 F841) named `detail` hides flake E722 (bare except) from pyflakes. "except Exception as detail: pass" can be read as "my program maybe corrupted or destroyed a file, or whatever, or nothing, but I don't care". Instead, it is better to only handle each specific type of exception (1 or more except clauses), & do something specific in each--`print` wouldn't prevent anything so it should be followed by `return None`, `sys.exit(1)`, or at least `continue` usually. Flake8 issues (&some PEP8 ones) aren't "rules" but red flags. – Poikilos Jan 15 '23 at 03:54
10

Posting this here for anyone looking to just extract text from an email and get a list of .eml files - took me forever to find a good answer to this online. NOTE: This will not get attachments to emails, just the text from email.

import email
from email import policy
from email.parser import BytesParser
import glob
import os

path = '/path/to/data/' # set this to "./" if in current directory

eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp:  # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    text = msg.get_body(preferencelist=('plain')).get_content()
    fp.close()
 
    text = text.split("\n")
    print (name) # Get name of eml file
    print (text) # Get list of all text in email

Credit to some of the code from this post: Reading .eml files with Python 3.6 using emaildata 0.3.4

Colonel_Old
  • 852
  • 9
  • 15
5

Python 3 version of Dalen's answer. Basically syntax issue fixes. (Can't comment due to lack of reputation, also clearer as an answer).

# To have attachments extracted into memory, change behaviour of 2 following functions:

def file_exists (f):
    """Checks whether extracted file was extracted before."""
    return os.path.exists(os.path.join(path, f))

def save_file (fn, cont):
    """Saves cont to a file fn"""
    file = open(os.path.join(path, fn), "wb")
    file.write(cont)
    file.close()

def construct_name (id, fn):
    """Constructs a file name out of messages ID and packed file name"""
    id = id.split(".")
    id = id[0]+id[1]
    return id+"."+fn

def disqo (s):
    """Removes double or single quotations."""
    s = s.strip()
    if s.startswith("'") and s.endswith("'"): return s[1:-1]
    if s.startswith('"') and s.endswith('"'): return s[1:-1]
    return s

def disgra (s):
    """Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
    s = s.strip()
    if s.startswith("<") and s.endswith(">"): return s[1:-1]
    return s

def pullout (m, key):
    """Extracts content from an e-mail message.
    This works for multipart and nested multipart messages too.
    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text  -- All text from all parts.
    Html  -- All HTMLs from all parts
    Files -- Dictionary mapping extracted file to message ID it belongs to.
    Parts -- Number of parts in original message.
    """
    Html = ""
    Text = ""
    Files = {}
    Parts = 0
    if not m.is_multipart():
        if m.get_filename(): # It's an attachment
            fn = m.get_filename()
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, None)
            if file_exists(cfn): return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        # Not an attachment!
        # See where this belongs. Text, Html or some other data:
        cp = m.get_content_type()
        if cp=="text/plain":
            Text += str(m.get_payload(decode=True))
        elif cp=="text/html":
            Html += str(m.get_payload(decode=True))
        else:
            # Something else!
            # Extract a message ID and a file name if there is one:
            # This is some packed file and name is contained in content-type header
            # instead of content-disposition header explicitly
            cp = m.get("content-type")
            try: id = disgra(m.get("content-id"))
            except: id = None
            # Find file name:
            o = cp.find("name=")
            if o==-1: return Text, Html, Files, 1
            ox = cp.find(";", o)
            if ox==-1: ox = None
            o += 5; fn = cp[o:ox]
            fn = disqo(fn)
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, id)
            if file_exists(cfn): return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
        return Text, Html, Files, 1
    # This IS a multipart message.
    # So, we iterate over it and call pullout() recursively for each part.
    y = 0
    while 1:
        # If we cannot get the payload, it means we hit the end:
        try:
            pl = m.get_payload(y)
        except: break
        # pl is a new Message object which goes back to pullout
        t, h, f, p = pullout(pl, key)
        Text += t; Html += h; Files.update(f); Parts += p
        y += 1
    return Text, Html, Files, Parts

def extract (msgfile, key):
    """Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
    msgfile -- A file-like readable object
    key     -- Some ID string for that particular Message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    Key files will be present only when message contained binary files.
    For more see __doc__ for pullout() and caption() functions.
    """
    m = email.message_from_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip(); Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
        "text": Text, "html": Html, "parts": Parts}
    if Files: msg["files"] = Files
    return msg

def caption (origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    Date = ""
    if origin.__contains__("date"): Date = origin["date"].strip()
    From = ""
    if origin.__contains__("from"): From = origin["from"].strip()
    To = ""
    if origin.__contains__("to"): To = origin["to"].strip()
    Subject = ""
    if origin.__contains__("subject"): Subject = origin["subject"].strip()
    return From, To, Subject, Date
Cristian Ciupitu
  • 20,270
  • 7
  • 50
  • 76
lastchancexi
  • 131
  • 2
  • 4
2

Try this:

#!python3
# -*- coding: utf-8 -*-

import email
import os

SOURCE_DIR = 'email'
DEST_DIR = 'temp'

def extractattachements(fle,suffix=None):
    message = email.message_from_file(open(fle))
    filenames = []
    if message.get_content_maintype() == 'multipart':
        for part in message.walk():
            if part.get_content_maintype() == 'multipart': continue
            #if part.get('Content-Disposition') is None: continue
            if part.get('Content-Type').find('application/octet-stream') == -1: continue
            filename = part.get_filename()
            if suffix:
                filename = ''.join( [filename.split('.')[0], '_', suffix, '.', filename.split('.')[1]])
            filename = os.path.join(DEST_DIR, filename)
            fb = open(filename,'wb')
            fb.write(part.get_payload(decode=True))
            fb.close()
            filenames.append(filename)
    return filenames

def main():
    onlyfiles = [f for f in os.listdir(SOURCE_DIR) if os.path.isfile(os.path.join(SOURCE_DIR, f))]
    for file in onlyfiles:
        #print path.join(SOURCE_DIR,file)
        extractattachements(os.path.join(SOURCE_DIR,file))
    return True

if __name__ == "__main__":
    main()
IvanTheFirst
  • 101
  • 1
  • 2
  • 1
    I like your answer, but be aware that attachments can be of any type like (image/jpeg, audio/mpeg, application/msword...), not only application/octet-stream. Better indicator of whether the attachment is actual attachment or a part of the email itself is the content-disposition header, and you checked for that, then you commented it out. Why is that? If it has some exact purpose, please write a comment about the reason for it. – Dalen Apr 27 '20 at 10:24
1

Here I am simplifying things for you so that you can get a more clear data to process on .....

.eml will consist of 2 parts on broad level 1) Headers 2)Content/Body

(Note it will discard any attachements if they are there)

Moreover I've removed https links also from .eml file but I'll tell you what to do if you want them .

1) Header : So I used eml-parser to get Header information you can install it using :

pip install eml-parser

View their documentation to get more info about how to get headers : https://pypi.org/project/eml-parser/

2)Content/Body : Now here I modified some older scripts to get best result in output

from email import policy
from email.parser import BytesParser
import glob
import os

path = './' # set this to "./" if in current directory

eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp:  # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    text = msg.get_body(preferencelist=('plain')).get_content()
    fp.close()
    
    print (name) # Get name of eml file
    # print (text) # Get list of all text in email

This is a part of code which was already available on many places and of which I don't take credit of......

Now I've added few conditions to print out the body in more pretty way these lines of code are mine and you can give me credit for that :

newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
    if(flag==1):
        flag = 0
        continue
    if(text[i]=="\\"):
        flag = 1
        continue
    if(text[i]=='<'): //to remove hyperlinks
        urlFlag = 1
        continue
    if(text[i]=='>'): //to remove hyperlinks
        urlFlag = 0
        continue
    if(urlFlag==0): //to remove hyperlinks
        newText = newText+text[i]

print(newText)

Now this will remove all the break-lines , tab space and other stuff (\t,\r,\n)

Moreover if you want to have links (http,https links present in your .eml file) then just remove 3 conditions and new code will look like :

newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
    if(flag==1):
        flag = 0
        continue
    if(text[i]=="\\"):
        flag = 1
        continue
    newText = newText+text[i]

print(newText)

Final Code (with removing links) :

from email import policy
from email.parser import BytesParser
import glob
import os

path = './' # set this to "./" if in current directory

eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp:  # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    text = msg.get_body(preferencelist=('plain')).get_content()
    fp.close()
    
    print (name) # Get name of eml file
    # print (text) # Get list of all text in email

    

    newText = ""
    flag = 0
    urlFlag = 0
    for i in range(len(text)):
        if(flag==1):
            flag = 0
            continue
        if(text[i]=="\\"):
            flag = 1
            continue
        if(text[i]=='<'):
            urlFlag = 1
            continue
        if(text[i]=='>'):
            urlFlag = 0
            continue
        if(urlFlag==0):
            newText = newText+text[i]

    print(newText)

This is my 1st answer on StackOverflow hope this will help you guys ! My Python version is : 3.8.10