diff --git a/projects/Store_emails_in_csv/README.md b/projects/Store_emails_in_csv/README.md index 1167878ea..24b71355d 100644 --- a/projects/Store_emails_in_csv/README.md +++ b/projects/Store_emails_in_csv/README.md @@ -1,31 +1,63 @@ -# Store mails in your inbox in csv format - -1)This script takes your email and password as input. - -2)Return a csv containing following attributes: - --Date - --From(Sender) - --Subject - --Mail Text - - -## Prerequisites - -You only need Python to run this script. You can visit [here](https://www.python.org/downloads/) to download Python. - - -## How to run the script - -Running the script is really simple! Just open a terminal in the folder where your script is located and run the following command : - - `pip install -r requirements.txt` - `python store_emails.py` - - -## *Author Name* - -gpriya32(Priyanka) +# Store emails in CSV + +This project contains a simple script to extract email messages +from an IMAP server. + +The messages are written to a simple four-column CSV file. + + +## Dependencies + +This depends on the BeautifulSoup library and `lxml` +for extracting text from HTML messages. + + +## Running the script + +You will need to have a file `credentials.txt` +with your IMAP server account name and password on separate lines. + +Gmail - and many other IMAP providers - +requires you to create a separate "application password" +to allow this code to run, so probably do that first. +Then put that password in `credentials.txt`. + +Then simply run + +``` +python store_emails.py +``` + +This generates `mails.csv` in the current directory. + +The generated CSV file contains the following fields for each message: + +* Date +* From (Sender) +* Subject +* Message text + + +## Development ideas + +This hardcodes the IMAP server for Gmail.com and the `"INBOX"` folder. +Perhaps this should be configured outside of the code +for easier customization. + +This brutally marks all messages as read. 
+Perhaps make it `PEEK` so as to not change the message flags. + +This will read everything in the `INBOX` folder. +It could be useful to make it remember which messages it has already seen, +and update a CSV file only with information from messages which have +arrived since the previous poll. + +It might be useful to be able to specify which messages to fetch, +instead of having it fetch everything every time. + +The exception handling is not a good example of how to do this properly. + + +## Author Name + +Aditya Jetely (@AdityaJ7) diff --git a/projects/Store_emails_in_csv/requirements.txt b/projects/Store_emails_in_csv/requirements.txt index c1f5f713c..6f83e94d2 100644 --- a/projects/Store_emails_in_csv/requirements.txt +++ b/projects/Store_emails_in_csv/requirements.txt @@ -1 +1,2 @@ beautifulsoup4 +lxml diff --git a/projects/Store_emails_in_csv/store_emails.py b/projects/Store_emails_in_csv/store_emails.py index af0383f6f..ed0353117 100644 --- a/projects/Store_emails_in_csv/store_emails.py +++ b/projects/Store_emails_in_csv/store_emails.py @@ -1,15 +1,20 @@ #!/usr/bin/env python -import imaplib +import csv import email from email import policy -import csv -import ssl +import imaplib +import logging import os +import ssl + from bs4 import BeautifulSoup -credential_path = os.getcwd() + "/credentials.txt" -csv_path = os.getcwd() + "/mails.csv" + +credential_path = "credentials.txt" +csv_path = "mails.csv" + +logger = logging.getLogger('imap_poller') host = "imap.gmail.com" port = 993 @@ -36,7 +41,7 @@ def get_text(email_body): return soup.get_text(separator="\n", strip=True) -def write_to_csv(mail, writer): +def write_to_csv(mail, writer, N, total_no_of_mails): for i in range(total_no_of_mails, total_no_of_mails - N, -1): res, data = mail.fetch(str(i), "(RFC822)") @@ -60,12 +65,11 @@ def write_to_csv(mail, writer): content_disposition = str(part.get("Content-Disposition")) try: # get the email email_body - email_body = part.get_payload(decode=True).decode( - 
"utf-8" - ) - email_text = get_text(email_body) - except Exception: - pass + email_body = part.get_payload(decode=True) + if email_body: + email_text = get_text(email_body.decode('utf-8')) + except Exception as exc: + logger.warning('Caught exception: %r', exc) if ( content_type == "text/plain" and "attachment" not in content_disposition @@ -80,18 +84,22 @@ def write_to_csv(mail, writer): # extract content type of email content_type = msg.get_content_type() # get the email email_body - email_body = msg.get_payload(decode=True).decode("utf-8") - email_text = get_text(email_body) - - # Write data in the csv file - row = [email_date, email_from, email_subject, email_text] - writer.writerow(row) - - -if __name__ == "__main__": + email_body = msg.get_payload(decode=True) + if email_body: + email_text = get_text(email_body.decode('utf-8')) + + if email_text is not None: + # Write data in the csv file + row = [email_date, email_from, email_subject, email_text] + writer.writerow(row) + else: + logger.warning('%s:%i: No message extracted', "INBOX", i) +def main(): mail, messages = connect_to_mailbox() + logging.basicConfig(level=logging.WARNING) + total_no_of_mails = int(messages[0]) # no. of latest mails to fetch # set it equal to total_no_of_emails to fetch all mail in the inbox @@ -101,6 +109,10 @@ def write_to_csv(mail, writer): writer = csv.writer(fw) writer.writerow(["Date", "From", "Subject", "Text mail"]) try: - write_to_csv(mail, writer) - except Exception as e: - print(e) + write_to_csv(mail, writer, N, total_no_of_mails) + except Exception as exc: + logger.warning('Caught exception: %r', exc) + + +if __name__ == "__main__": + main()