From b6b40134fe9a59fbb3ebd17955720b74fb0bfe9e Mon Sep 17 00:00:00 2001 From: tripleee Date: Thu, 1 Oct 2020 20:08:06 +0300 Subject: [PATCH 1/6] projects/Store_emails_in_csv/store_emails.py: PEP8 up Also, create `def main` so the toplevel can be imported by other projects if they want to --- projects/Store_emails_in_csv/store_emails.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/projects/Store_emails_in_csv/store_emails.py b/projects/Store_emails_in_csv/store_emails.py index af0383f6f..48cb0c684 100644 --- a/projects/Store_emails_in_csv/store_emails.py +++ b/projects/Store_emails_in_csv/store_emails.py @@ -1,13 +1,15 @@ #!/usr/bin/env python -import imaplib +import csv import email from email import policy -import csv -import ssl +import imaplib import os +import ssl + from bs4 import BeautifulSoup + credential_path = os.getcwd() + "/credentials.txt" csv_path = os.getcwd() + "/mails.csv" @@ -88,8 +90,7 @@ def write_to_csv(mail, writer): writer.writerow(row) -if __name__ == "__main__": - +def main(): mail, messages = connect_to_mailbox() total_no_of_mails = int(messages[0]) @@ -104,3 +105,7 @@ def write_to_csv(mail, writer): write_to_csv(mail, writer) except Exception as e: print(e) + + +if __name__ == "__main__": + main() From b0f8cda0f33604224f4e9162b6b245c5c7444f18 Mon Sep 17 00:00:00 2001 From: tripleee Date: Thu, 1 Oct 2020 20:11:59 +0300 Subject: [PATCH 2/6] projects/Store_emails_in_csv/store_emails.py: remove os.getcwd() Your computer already knows which directory you are in --- projects/Store_emails_in_csv/store_emails.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/Store_emails_in_csv/store_emails.py b/projects/Store_emails_in_csv/store_emails.py index 48cb0c684..550a7e21d 100644 --- a/projects/Store_emails_in_csv/store_emails.py +++ b/projects/Store_emails_in_csv/store_emails.py @@ -10,8 +10,8 @@ from bs4 import BeautifulSoup -credential_path = os.getcwd() + "/credentials.txt" 
-csv_path = os.getcwd() + "/mails.csv" +credential_path = "credentials.txt" +csv_path = "mails.csv" host = "imap.gmail.com" port = 993 From af9610030bc57b7439cd6c2bd279ca03b948ddf8 Mon Sep 17 00:00:00 2001 From: tripleee Date: Thu, 1 Oct 2020 20:23:08 +0300 Subject: [PATCH 3/6] projects/Store_emails_in_csv/store_emails.py: don't silently eat exceptions import logging; display a log message when something goes wrong --- projects/Store_emails_in_csv/store_emails.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/projects/Store_emails_in_csv/store_emails.py b/projects/Store_emails_in_csv/store_emails.py index 550a7e21d..f42965b4e 100644 --- a/projects/Store_emails_in_csv/store_emails.py +++ b/projects/Store_emails_in_csv/store_emails.py @@ -4,6 +4,7 @@ import email from email import policy import imaplib +import logging import os import ssl @@ -13,6 +14,8 @@ credential_path = "credentials.txt" csv_path = "mails.csv" +logger = logging.getLogger('imap_poller') + host = "imap.gmail.com" port = 993 ssl_context = ssl.create_default_context() @@ -66,8 +69,8 @@ def write_to_csv(mail, writer): "utf-8" ) email_text = get_text(email_body) - except Exception: - pass + except Exception as exc: + logger.warning('Caught exception: %r', exc) if ( content_type == "text/plain" and "attachment" not in content_disposition @@ -93,6 +96,8 @@ def write_to_csv(mail, writer): def main(): mail, messages = connect_to_mailbox() + logging.basicConfig(level=logging.WARNING) + total_no_of_mails = int(messages[0]) # no. 
of latest mails to fetch # set it equal to total_no_of_emails to fetch all mail in the inbox @@ -103,8 +108,8 @@ def main(): writer.writerow(["Date", "From", "Subject", "Text mail"]) try: write_to_csv(mail, writer) - except Exception as e: - print(e) + except Exception as exc: + logger.warning('Caught exception: %r', exc) if __name__ == "__main__": From a18615ef6b9f91cae04959ed24f11e245d87202f Mon Sep 17 00:00:00 2001 From: tripleee Date: Thu, 1 Oct 2020 20:26:20 +0300 Subject: [PATCH 4/6] projects/Store_emails_in_csv/README.md: update with project description Merged in upstream changes into my work in progress --- projects/Store_emails_in_csv/README.md | 85 ++++++++++++++++---------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/projects/Store_emails_in_csv/README.md b/projects/Store_emails_in_csv/README.md index 1167878ea..d452b5043 100644 --- a/projects/Store_emails_in_csv/README.md +++ b/projects/Store_emails_in_csv/README.md @@ -1,31 +1,54 @@ -# Store mails in your inbox in csv format - -1)This script takes your email and password as input. - -2)Return a csv containing following attributes: - --Date - --From(Sender) - --Subject - --Mail Text - - -## Prerequisites - -You only need Python to run this script. You can visit [here](https://www.python.org/downloads/) to download Python. - - -## How to run the script - -Running the script is really simple! Just open a terminal in the folder where your script is located and run the following command : - - `pip install -r requirements.txt` - `python store_emails.py` - - -## *Author Name* - -gpriya32(Priyanka) +# Store emails in CSV + +This project contains a simple script to extract email messages +from an IMAP server. + +The messages are written to a simple four-column CSV file. + + +## Dependencies + +This depends on the BeautifulSoup library +for extracting text from HTML messages. 
+ +## Running the script + +You will need to have a file `credentials.txt` +with your IMAP server account name and password on separate lines. + +This generates `mails.csv` in the current directory. + +``` +python store_emails.py +``` + +The generate CSV file contains the following fields for each message: + +* Date +* From (Sender) +* Subject +* Message text + + +## Development ideas + +This hardcodes the IMAP server for Gmail.com and the `"INBOX"` folder. +Perhaps this should be configured outside of the code +for easier customization. + +This brutally marks all messages as read. +Perhaps make it `PEEK` so as to not change the message flags. + +This will read everything in the `INBOX` folder. +It could be useful to make it remember which messages it has already seen, +and update a CSV file only with information from messages which have +arrived since the previous poll. + +It might be useful to be able to specify which messages to fetch, +instead of having it fetch everything every time. + + +## Author Name + +Aditya Jetely (@AdityaJ7) From 5665e14703277f14b6bc2ccc4cef8092834255e1 Mon Sep 17 00:00:00 2001 From: tripleee Date: Tue, 27 Oct 2020 08:23:08 +0200 Subject: [PATCH 5/6] README.md: additional updates --- projects/Store_emails_in_csv/README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/projects/Store_emails_in_csv/README.md b/projects/Store_emails_in_csv/README.md index d452b5043..7d1e8ec85 100644 --- a/projects/Store_emails_in_csv/README.md +++ b/projects/Store_emails_in_csv/README.md @@ -17,13 +17,20 @@ for extracting text from HTML messages. You will need to have a file `credentials.txt` with your IMAP server account name and password on separate lines. -This generates `mails.csv` in the current directory. +Gmail - and many other IMAP providers - +requires you to create a separate "application password" +to allow this code to run, so probably do that first. +Then put that password in `credentials.txt`. 
+ +Then simply run ``` python store_emails.py ``` -The generate CSV file contains the following fields for each message: +This generates `mails.csv` in the current directory. + +The generated CSV file contains the following fields for each message: * Date * From (Sender) @@ -48,6 +55,8 @@ arrived since the previous poll. It might be useful to be able to specify which messages to fetch, instead of having it fetch everything every time. +The exception handling is not a good example of how to do this properly. + ## Author Name From 6866c61ba88849fe8151dda067a590f3ccafe87c Mon Sep 17 00:00:00 2001 From: tripleee Date: Tue, 27 Oct 2020 08:23:43 +0200 Subject: [PATCH 6/6] store_emails.py: fix unbound variable (!) Also, guard against None being returned from body extraction Also, update requirements.txt to include lxml for bs4 --- projects/Store_emails_in_csv/README.md | 2 +- projects/Store_emails_in_csv/requirements.txt | 1 + projects/Store_emails_in_csv/store_emails.py | 28 ++++++++++--------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/projects/Store_emails_in_csv/README.md b/projects/Store_emails_in_csv/README.md index 7d1e8ec85..24b71355d 100644 --- a/projects/Store_emails_in_csv/README.md +++ b/projects/Store_emails_in_csv/README.md @@ -8,7 +8,7 @@ The messages are written to a simple four-column CSV file. ## Dependencies -This depends on the BeautifulSoup library +This depends on the BeautifulSoup library and `lxml` for extracting text from HTML messages. 
diff --git a/projects/Store_emails_in_csv/requirements.txt b/projects/Store_emails_in_csv/requirements.txt index c1f5f713c..6f83e94d2 100644 --- a/projects/Store_emails_in_csv/requirements.txt +++ b/projects/Store_emails_in_csv/requirements.txt @@ -1 +1,2 @@ beautifulsoup4 +lxml diff --git a/projects/Store_emails_in_csv/store_emails.py b/projects/Store_emails_in_csv/store_emails.py index f42965b4e..ed0353117 100644 --- a/projects/Store_emails_in_csv/store_emails.py +++ b/projects/Store_emails_in_csv/store_emails.py @@ -41,7 +41,7 @@ def get_text(email_body): return soup.get_text(separator="\n", strip=True) -def write_to_csv(mail, writer): +def write_to_csv(mail, writer, N, total_no_of_mails): for i in range(total_no_of_mails, total_no_of_mails - N, -1): res, data = mail.fetch(str(i), "(RFC822)") @@ -65,10 +65,9 @@ def write_to_csv(mail, writer): content_disposition = str(part.get("Content-Disposition")) try: # get the email email_body - email_body = part.get_payload(decode=True).decode( - "utf-8" - ) - email_text = get_text(email_body) + email_body = part.get_payload(decode=True) + if email_body: + email_text = get_text(email_body.decode('utf-8')) except Exception as exc: logger.warning('Caught exception: %r', exc) if ( @@ -85,13 +84,16 @@ def write_to_csv(mail, writer): # extract content type of email content_type = msg.get_content_type() # get the email email_body - email_body = msg.get_payload(decode=True).decode("utf-8") - email_text = get_text(email_body) - - # Write data in the csv file - row = [email_date, email_from, email_subject, email_text] - writer.writerow(row) - + email_body = msg.get_payload(decode=True) + if email_body: + email_text = get_text(email_body.decode('utf-8')) + + if email_text is not None: + # Write data in the csv file + row = [email_date, email_from, email_subject, email_text] + writer.writerow(row) + else: + logger.warning('%s:%i: No message extracted', "INBOX", i) def main(): mail, messages = connect_to_mailbox() @@ -107,7 +109,7 @@ 
def main(): writer = csv.writer(fw) writer.writerow(["Date", "From", "Subject", "Text mail"]) try: - write_to_csv(mail, writer) + write_to_csv(mail, writer, N, total_no_of_mails) except Exception as exc: logger.warning('Caught exception: %r', exc)