#!/usr/bin/env python
# coding: utf-8

# # Mining the Social Web
#
# ## Mining Mailboxes
#
# This Jupyter Notebook provides an interactive way to follow along with and
# explore the examples from the video series. The intent behind this notebook
# is to reinforce the concepts in a fun, convenient, and effective way.

# ## Converting a toy mailbox to JSON

import mailbox  # NOTE: mailbox is in the standard library; no pip install needed
import json

MBOX = 'resources/ch07-mailboxes/data/northpole.mbox'


def objectify_message(msg):
    """Convert one mbox message into a plain, JSON-serializable dict.

    A routine that makes a ton of simplifying assumptions about converting
    an mbox message into a Python object, given the nature of the
    northpole.mbox file, in order to demonstrate the basic parsing of an
    mbox with mail utilities.

    Args:
        msg: a mailbox.mboxMessage (or email.message.Message).

    Returns:
        dict mapping each header name to its value, plus 'contentType'
        and 'content' keys taken from the first part of the message.
    """
    # Map in fields from the message headers.
    # FIX: dict() accepts the (key, value) pairs directly; the intermediate
    # list comprehension was redundant.
    o_msg = dict(msg.items())

    # Assume one part to the message and get its content and its content
    # type. FIX: next() takes the first part without materializing the
    # whole walk into a throwaway list.
    part = next(msg.walk())
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()

    return o_msg


# Create an mbox that can be iterated over and transform each of its
# messages to a convenient JSON representation.
mbox = mailbox.mbox(MBOX)
messages = [objectify_message(msg) for msg in mbox]

print(json.dumps(messages, indent=1))

# ## Downloading the Enron email corpus

import sys
from urllib.request import urlopen
import time
import os
import envoy  # pip install envoy

URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
DOWNLOAD_DIR = "resources/ch07-mailboxes/data"


def download(url, download_dir):
    """Download *url* into *download_dir*, printing status every 5 seconds.

    Args:
        url: HTTP(S) URL of the file to fetch.
        download_dir: existing directory to write the file into.

    Returns:
        The local path of the downloaded file.
    """
    file_name = url.split('/')[-1]
    u = urlopen(url)

    # FIX: the original left the output file open if anything raised
    # mid-transfer; the context manager guarantees it is closed.
    with open(os.path.join(download_dir, file_name), 'wb') as f:
        meta = u.info()
        file_size = int(meta['Content-Length'])
        print("Downloading: %s Bytes: %s" % (file_name, file_size))

        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()

        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break

            file_size_dl += len(buffer)
            f.write(buffer)

            # chr(8) emits backspaces so the status overwrites itself on
            # terminals that honor them.
            download_status = r"%10d MB [%5.2f%%]" % (
                file_size_dl / 1000000.0,
                file_size_dl * 100.0 / file_size)
            download_status = download_status + chr(8) * (len(download_status) + 1)

            # Throttle status output to once every 5 seconds.
            if time.time() - last_update > 5:
                print(download_status)
                sys.stdout.flush()
                last_update = time.time()

        return f.name


def tar_xzf(f):
    """Extract a gzipped tarfile, e.g. "$ tar xzf filename.tgz"."""
    # Call out to the shell for a faster decompression.
    # This will still take a while because Vagrant synchronizes
    # thousands of files that are extracted to the host machine.
    r = envoy.run("tar xzf %s -C %s" % (f, DOWNLOAD_DIR))
    print(r.std_out)
    print(r.std_err)


f = download(URL, DOWNLOAD_DIR)
print("Download complete: %s" % (f,))
tar_xzf(f)
print("Decompression complete")
print("Data is ready")

# ## Converting the Enron corpus to a standardized mbox format
#
# The results of the sample code below have been saved as a file,
# `enron.mbox.bz2`, in a compressed format. You may decompress it to
# `enron.mbox` using whatever tool you prefer, appropriate to your
# computer's operating system. On UNIX-like systems, the file may be
# decompressed with the command:
#
# `tar -xjf enron.mbox.bz2`

import re
import email
from time import asctime
import os
import sys
from dateutil.parser import parse  # pip install python_dateutil

# XXX: Download the Enron corpus to resources/ch07-mailboxes/data
# and unarchive it there.
MAILDIR = 'resources/ch07-mailboxes/data/enron_mail_20110402/maildir'

# Where to write the converted mbox
MBOX = 'resources/ch07-mailboxes/data/enron.mbox'

# Create a file handle that we'll be writing into...
# FIX: use a context manager so the output mbox is flushed and closed even
# if a message fails to parse partway through the directory walk.
with open(MBOX, 'w+') as mbox:
    # Walk the directories and process any folder named 'inbox'.
    for (root, dirs, file_names) in os.walk(MAILDIR):

        if root.split(os.sep)[-1].lower() != 'inbox':
            continue

        # Process each message in 'inbox'.
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            # FIX: close each message file; the original leaked one file
            # handle per message across thousands of files.
            with open(file_path, errors='ignore') as fh:
                message_text = fh.read()

            # Compute fields for the From_ line in a traditional mbox
            # message. FIX: guard against messages missing a From: or
            # Date: header — the original raised AttributeError on a
            # failed match and aborted the whole conversion.
            from_match = re.search(r"From: ([^\r\n]+)", message_text)
            date_match = re.search(r"Date: ([^\r\n]+)", message_text)
            if from_match is None or date_match is None:
                continue
            _from = from_match.groups()[0]
            _date = date_match.groups()[0]

            # Convert _date to the asctime representation for the From_ line.
            _date = asctime(parse(_date).timetuple())

            msg = email.message_from_string(message_text)
            msg.set_unixfrom('From {0} {1}'.format(_from, _date))

            mbox.write(msg.as_string(unixfrom=True) + "\n\n")

# ## Loading the mailbox data into Pandas

import pandas as pd  # pip install pandas
import mailbox

MBOX = 'resources/ch07-mailboxes/data/enron.mbox'
mbox = mailbox.mbox(MBOX)

mbox_dict = {}
for i, msg in enumerate(mbox):
    mbox_dict[i] = {}

    for header in msg.keys():
        mbox_dict[i][header] = msg[header]

    # NOTE(review): assumes single-part messages — get_payload() returns a
    # list for multipart messages, which would break .replace() here.
    mbox_dict[i]['Body'] = msg.get_payload().replace('\n', ' ') \
                                            .replace('\t', ' ') \
                                            .replace('\r', ' ') \
                                            .strip()

df = pd.DataFrame.from_dict(mbox_dict, orient='index')

df.head()

# Index the frame by the parsed Date header.
df.index = df['Date'].apply(pd.to_datetime)

# Remove non-essential columns.
cols_to_keep = ['From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']
df = df[cols_to_keep]

df.head()

# ## Describe the DataFrame

df.describe()

# ## Investigate email volume by month

start_date = '2000-1-1'
stop_date = '2003-1-1'

# Month-end counts of messages within [start_date, stop_date).
datemask = (df.index > start_date) & (df.index <= stop_date)
vol_by_month = df.loc[datemask].resample('1M').count()['To']

print(vol_by_month)

from prettytable import PrettyTable

pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'
# FIX: plain loop instead of a throwaway list comprehension used only for
# its side effects.
for ind, vol in zip(vol_by_month.index, vol_by_month):
    pt.add_row([ind.year, ind.month, vol])
print(pt)

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

vol_by_month[::-1].plot(kind='barh', figsize=(5, 8),
                        title='Email Volume by Month')

# ## Analyzing Patterns in Sender/Recipient Communications

senders = df['From'].unique()
receivers = df['To'].unique()
cc_receivers = df['Cc'].unique()
bcc_receivers = df['Bcc'].unique()

print('Num Senders:', len(senders))
print('Num Receivers:', len(receivers))
print('Num CC Receivers:', len(cc_receivers))
print('Num BCC Receivers:', len(bcc_receivers))

senders = set(senders)
receivers = set(receivers)
cc_receivers = set(cc_receivers)
bcc_receivers = set(bcc_receivers)

# Find the number of senders who were also direct receivers.
senders_intersect_receivers = senders.intersection(receivers)

# Find the senders that didn't receive any messages.
senders_diff_receivers = senders.difference(receivers)

# Find the receivers that didn't send any messages.
receivers_diff_senders = receivers.difference(senders)

# Find the senders who were any kind of receiver by
# first computing the union of all types of receivers.
all_receivers = receivers.union(cc_receivers, bcc_receivers)
senders_all_receivers = senders.intersection(all_receivers)

print("Num senders in common with receivers:", len(senders_intersect_receivers))
print("Num senders who didn't receive:", len(senders_diff_receivers))
print("Num receivers who didn't send:", len(receivers_diff_senders))
print("Num senders in common with *all* receivers:", len(senders_all_receivers))

# ### Who is Sending and Receiving the Most Email?
import numpy as np

top_senders = df.groupby('From')
top_receivers = df.groupby('To')

top_senders = top_senders.count()['To']
top_receivers = top_receivers.count()['From']

# Order the top senders and receivers in descending order of volume.
# FIX: the original used np.argsort(...)[::-1] and then indexed the Series
# with those integer *positions*; on a string-indexed Series that indexing
# is label-based and fails on modern pandas. sort_values is correct and
# produces the same descending ordering.
top_senders = top_senders.sort_values(ascending=False)
top_receivers = top_receivers.sort_values(ascending=False)

from prettytable import PrettyTable

top10 = top_senders[:10]
pt = PrettyTable(field_names=['Rank', 'Sender', 'Messages Sent'])
pt.align['Messages Sent'] = 'r'
for i, (addr, vol) in enumerate(zip(top10.index.values, top10.values)):
    pt.add_row([i + 1, addr, vol])
print(pt)

from prettytable import PrettyTable

top10 = top_receivers[:10]
pt = PrettyTable(field_names=['Rank', 'Receiver', 'Messages Received'])
# FIX: copy-paste bug — alignment was set on 'Messages Sent', a column that
# does not exist in this table; the intended column is 'Messages Received'.
pt.align['Messages Received'] = 'r'
for i, (addr, vol) in enumerate(zip(top10.index.values, top10.values)):
    pt.add_row([i + 1, addr, vol])
print(pt)

# ## Searching by keyword

import textwrap

search_term = 'raptor'

query = (df['Body'].str.contains(search_term, case=False) |
         df['Subject'].str.contains(search_term, case=False))
results = df[query]

print('{0} results found.'.format(query.sum()))
print('Printing first 10 results...')
# FIX: don't assume at least 10 hits — iterate over at most 10 result rows
# instead of raising IndexError when fewer matches exist.
for i in range(min(10, len(results))):
    subject, body = results.iloc[i]['Subject'], results.iloc[i]['Body']
    print()
    print('SUBJECT: ', subject)
    print('-' * 20)
    for line in textwrap.wrap(body, width=70, max_lines=5):
        print(line)

# ## Accessing Your Gmail Programmatically

import httplib2
import os
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Gmail API Python Quickstart'

# FIX: `flags` was referenced inside get_credentials() but never defined,
# raising NameError the first time the OAuth flow actually runs. Parse an
# empty argument list so Jupyter's own command-line argv is ignored.
flags = tools.argparser.parse_args(args=[])


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:
            # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('gmail', 'v1', http=http)

results = service.users().labels().list(userId='me').execute()
labels = results.get('labels', [])

if not labels:
    print('No labels found.')
else:
    print('Labels:')
    for label in labels:
        print(label['name'])

# ## Fetch Gmail Messages

query = 'Mining'
max_results = 10

# Search for Gmail messages containing the query term.
results = service.users().messages().list(userId='me', q=query,
                                          maxResults=max_results).execute()

# FIX: the Gmail API omits the 'messages' key entirely when there are no
# hits, so a direct results['messages'] lookup would raise KeyError.
for result in results.get('messages', []):
    print(result['id'])

    # Retrieve the message itself.
    msg = service.users().messages().get(userId='me',
                                         id=result['id'],
                                         format='minimal').execute()
    print(msg)