#!/usr/bin/env python
# coding: utf-8

# # Mining the Social Web
# 
# ## Mining Mailboxes
# 
# This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way.

# ## Converting a toy mailbox to JSON

# In[ ]:


import mailbox # part of the Python standard library (no pip install needed)
import json


# In[ ]:


# Path to the small example mbox file that is opened with mailbox.mbox() below
MBOX = 'resources/ch07-mailboxes/data/northpole.mbox'


# In[ ]:


# A routine that makes a ton of simplifying assumptions
# about converting an mbox message into a Python object
# given the nature of the northpole.mbox file in order
# to demonstrate the basic parsing of an mbox with mail
# utilities

def objectify_message(msg):
    """Convert an email message into a plain, JSON-friendly dict.

    The result maps each header name to its value and adds two extra keys,
    'contentType' and 'content', taken from the first non-multipart part
    of the message.

    Args:
        msg: An email.message.Message (or subclass) instance.

    Returns:
        dict: Header name -> header value, plus 'contentType' and 'content'.
        When a header occurs more than once, the last occurrence wins.
    """
    # Map in fields from the message
    o_msg = dict(msg.items())

    # Take the first leaf part. For a single-part message walk() yields the
    # message itself first, so nothing changes; for a multipart message this
    # skips the container, whose payload is a list of Message objects that
    # json.dumps() cannot serialize.
    part = next((p for p in msg.walk() if not p.is_multipart()), msg)
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()

    return o_msg


# In[ ]:


# Create an mbox that can be iterated over and transform each of its
# messages to a convenient JSON representation

# Open the mailbox, turn every message it yields into a dict,
# and dump the collected dicts as indented JSON
mbox = mailbox.mbox(MBOX)

messages = [objectify_message(m) for m in mbox]

print(json.dumps(messages, indent=1))


# ## Downloading the Enron email corpus

# In[ ]:


import sys
from urllib.request import urlopen
import time
import os
import envoy # pip install envoy

# Location of the gzipped tar archive that download() fetches below
URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
# Directory the archive is saved into and, via tar_xzf(), extracted into
DOWNLOAD_DIR = "resources/ch07-mailboxes/data"

# Downloads a file and displays a download status every 5 seconds

def download(url, download_dir):
    """Download url into download_dir, printing progress about every 5 seconds.

    Args:
        url: Address of the file to fetch; the text after the final '/'
            becomes the local file name.
        download_dir: Existing directory in which to save the file.

    Returns:
        str: Path of the saved file (download_dir joined with the file name).
    """
    file_name = url.split('/')[-1]
    target_path = os.path.join(download_dir, file_name)

    # The context managers close both the response and the output file
    # even when the transfer fails part-way through.
    with urlopen(url) as u, open(target_path, 'wb') as f:
        # Content-Length is optional, so the total size may be unknown.
        content_length = u.info()['Content-Length']
        file_size = int(content_length) if content_length is not None else None
        print("Downloading: %s Bytes: %s"
              % (file_name, file_size if file_size is not None else 'unknown'))

        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break

            file_size_dl += len(buffer)
            f.write(buffer)

            # Only build the status text when it is actually going to be shown.
            if time.time() - last_update > 5:
                if file_size:
                    download_status = r"%10d MB  [%5.2f%%]" % (
                        file_size_dl / 1000000.0,
                        file_size_dl * 100.0 / file_size,
                    )
                else:
                    # No (or zero) advertised size: a percentage is meaningless.
                    download_status = r"%10d MB" % (file_size_dl / 1000000.0)
                # The status text is followed by backspace characters (chr(8)),
                # keeping the existing output format.
                download_status = download_status + chr(8)*(len(download_status)+1)
                print(download_status)
                sys.stdout.flush()
                last_update = time.time()

    return target_path

# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"

def tar_xzf(f):
    """Unpack the gzipped tar archive f into DOWNLOAD_DIR and print tar's output.

    Args:
        f: Path of the archive to extract.
    """
    # Call out to the shell for a faster decompression.
    # This will still take a while because Vagrant synchronizes
    # thousands of files that are extracted to the host machine
    command = "tar xzf %s -C %s" % (f, DOWNLOAD_DIR)
    result = envoy.run(command)

    # Show standard output first, then standard error
    for captured in (result.std_out, result.std_err):
        print(captured)
# Fetch the archive, unpack it into DOWNLOAD_DIR, and report each step
f = download(URL, DOWNLOAD_DIR)
print("Download complete: %s" % f)
tar_xzf(f)
print("Decompression complete", "Data is ready", sep="\n")


# ## Converting the Enron corpus to a standardized mbox format
# 
# The results of the sample code below have been saved as a file, `enron.mbox.bz2`, in a compressed format. You may decompress it to `enron.mbox` using whatever tool you prefer, appropriate to your computer's operating system. On UNIX-like systems, the file may be decompressed with the command:
# 
# `tar -xjf enron.mbox.bz2`

# In[ ]:


import re
import email
from time import asctime
import os
import sys
from dateutil.parser import parse # pip install python_dateutil

# XXX: Download the Enron corpus to resources/ch07-mailboxes/data
# and unarchive it there.

from email.generator import Generator

MAILDIR = 'resources/ch07-mailboxes/data/enron_mail_20110402/maildir'

# Where to write the converted mbox
MBOX = 'resources/ch07-mailboxes/data/enron.mbox'

# Compiled once, outside the loops, since they are applied to every message
FROM_PATTERN = re.compile(r"From: ([^\r\n]+)")
DATE_PATTERN = re.compile(r"Date: ([^\r\n]+)")

# The with-block closes the output file even if a message fails to convert
with open(MBOX, 'w') as mbox:

    # Message.as_string() does not escape body lines that begin with
    # "From ", which an mbox reader would mistake for the start of a new
    # message. A Generator with mangle_from_=True rewrites them as ">From ";
    # maxheaderlen=0 keeps headers unwrapped, matching as_string().
    writer = Generator(mbox, mangle_from_=True, maxheaderlen=0)

    # Walk the directories and process any folder named 'inbox'
    for (root, dirs, file_names) in os.walk(MAILDIR):

        if os.path.basename(root).lower() != 'inbox':
            continue

        # Process each message in 'inbox'
        for file_name in file_names:
            file_path = os.path.join(root, file_name)

            # Close each message file promptly; there are thousands of them
            with open(file_path, errors='ignore') as message_file:
                message_text = message_file.read()

            # Compute fields for the From_ line in a traditional mbox message
            from_match = FROM_PATTERN.search(message_text)
            date_match = DATE_PATTERN.search(message_text)
            if from_match is None or date_match is None:
                # Without both fields no From_ line can be built; report and
                # move on instead of failing on a None match.
                print("Skipping %s: no From/Date header found" % file_path,
                      file=sys.stderr)
                continue
            _from = from_match.group(1)

            # Convert the date to the asctime representation for the From_ line
            _date = asctime(parse(date_match.group(1)).timetuple())

            msg = email.message_from_string(message_text)
            msg.set_unixfrom('From {0} {1}'.format(_from, _date))

            writer.flatten(msg, unixfrom=True)
            mbox.write("\n\n")


# ## Loading the mailbox data into Pandas

# In[ ]:


import pandas as pd # pip install pandas
import mailbox

MBOX = 'resources/ch07-mailboxes/data/enron.mbox'
mbox = mailbox.mbox(MBOX)

# Build one record per message, keyed by the message's position in the mbox:
# every header becomes a field, and the payload is stored under 'Body' with
# newlines, tabs and carriage returns turned into spaces.
mbox_dict = {}
for i, msg in enumerate(mbox):
    record = {header: msg[header] for header in msg.keys()}

    body = msg.get_payload()
    for whitespace in ('\n', '\t', '\r'):
        body = body.replace(whitespace, ' ')
    record['Body'] = body.strip()

    mbox_dict[i] = record

df = pd.DataFrame.from_dict(mbox_dict, orient='index')


# In[ ]:


# Preview the first rows of the freshly built DataFrame
df.head()


# In[ ]:


# Index the rows by their 'Date' header, converting each value with pd.to_datetime
df.index = df['Date'].apply(pd.to_datetime)

# Remove non-essential columns
cols_to_keep = ['From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']
df = df[cols_to_keep]


# In[ ]:


# Preview again, now with the date index and only the kept columns
df.head()


# ## Describe the DataFrame

# In[ ]:


# Descriptive statistics for the remaining columns
df.describe()


# ## Investigate email volume by month

# In[ ]:


# Messages are kept when start_date < timestamp <= stop_date
start_date, stop_date = '2000-1-1', '2003-1-1'

# Make the dataframe index a datetime index
df.index = pd.to_datetime(df.index, utc=True)

# Build the window mask from its two bounds
after_start = df.index > start_date
up_to_stop = df.index <= stop_date
datemask = after_start & up_to_stop

# Count the non-null 'To' values in each monthly bin
in_window = df.loc[datemask]
monthly_counts = in_window.resample('1M').count()
vol_by_month = monthly_counts['To']

print(vol_by_month)


# In[ ]:


from prettytable import PrettyTable

# Tabulate the monthly message counts, right-aligning the numeric columns
pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'

# add_row() is called purely for its side effect, so use a plain loop
# rather than building and discarding a list of its return values.
for ind, vol in zip(vol_by_month.index, vol_by_month):
    pt.add_row([ind.year, ind.month, vol])

print(pt)


# In[ ]:


import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

# Plot the monthly counts in reverse order as horizontal bars
reversed_volume = vol_by_month.iloc[::-1]
reversed_volume.plot.barh(figsize=(5, 8), title='Email Volume by Month')


# ## Analyzing Patterns in Sender/Recipient Communications

# In[ ]:


# Distinct values of each address column, in the order From, To, Cc, Bcc
senders, receivers, cc_receivers, bcc_receivers = (
    df[column].unique() for column in ('From', 'To', 'Cc', 'Bcc')
)

# Report how many distinct values each column holds
for caption, addresses in (
    ('Num Senders:', senders),
    ('Num Receivers:', receivers),
    ('Num CC Receivers:', cc_receivers),
    ('Num BCC Receivers:', bcc_receivers),
):
    print(caption, len(addresses))


# In[ ]:


# Turn each collection of unique values into a set for set algebra
senders, receivers, cc_receivers, bcc_receivers = map(
    set, (senders, receivers, cc_receivers, bcc_receivers)
)

# Senders who were also direct receivers
senders_intersect_receivers = senders & receivers

# Senders that didn't receive any messages
senders_diff_receivers = senders - receivers

# Receivers that didn't send any messages
receivers_diff_senders = receivers - senders

# Senders who were any kind of receiver: first pool the To, Cc and Bcc
# receivers, then intersect that pool with the senders
all_receivers = receivers | cc_receivers | bcc_receivers
senders_all_receivers = senders & all_receivers

for caption, group in (
    ("Num senders in common with receivers:", senders_intersect_receivers),
    ("Num senders who didn't receive:", senders_diff_receivers),
    ("Num receivers who didn't send:", receivers_diff_senders),
    ("Num senders in common with *all* receivers:", senders_all_receivers),
):
    print(caption, len(group))


# ### Who is Sending and Receiving the Most Email?

# In[ ]:


import numpy as np

# Count messages per sender (non-null 'To' values per 'From' group) and per
# receiver (non-null 'From' values per 'To' group)
top_senders = df.groupby('From').count()['To']
top_receivers = df.groupby('To').count()['From']

# Get the ordered positions of the top senders and receivers in descending order
top_snd_ord = np.argsort(top_senders.to_numpy())[::-1]
top_rcv_ord = np.argsort(top_receivers.to_numpy())[::-1]

# The orderings are integer positions while the Series are indexed by
# address strings, so reorder with .iloc instead of relying on [] to
# fall back to positional lookup.
top_senders = top_senders.iloc[top_snd_ord]
top_receivers = top_receivers.iloc[top_rcv_ord]


# In[ ]:


from prettytable import PrettyTable

# Tabulate the ten most prolific senders
top10 = top_senders[:10]
pt = PrettyTable(field_names=['Rank', 'Sender', 'Messages Sent'])
pt.align['Messages Sent'] = 'r'

# add_row() is called for its side effect, so use a plain loop. The loop
# variable is named `address` so that it cannot rebind the `email` module
# name at module level.
for rank, (address, vol) in enumerate(zip(top10.index.values, top10.values), start=1):
    pt.add_row([rank, address, vol])

print(pt)


# In[ ]:


from prettytable import PrettyTable

# Tabulate the ten addresses that received the most messages
top10 = top_receivers[:10]
pt = PrettyTable(field_names=['Rank', 'Receiver', 'Messages Received'])
# Align the count column of THIS table; 'Messages Sent' is not one of its fields
pt.align['Messages Received'] = 'r'

# add_row() is called for its side effect, so use a plain loop. The loop
# variable is named `address` so that it cannot rebind the `email` module
# name at module level.
for rank, (address, vol) in enumerate(zip(top10.index.values, top10.values), start=1):
    pt.add_row([rank, address, vol])

print(pt)


# ## Searching by keyword

# In[ ]:


import textwrap

search_term = 'raptor'

# Case-insensitive match against either the body or the subject.
# na=False makes a missing value count as "no match" so the mask stays
# purely boolean.
query = (df['Body'].str.contains(search_term, case=False, na=False)
         | df['Subject'].str.contains(search_term, case=False, na=False))

results = df[query]

print('{0} results found.'.format(query.sum()))
print('Printing first 10 results...')

# head(10) yields at most ten rows, so fewer than ten hits no longer
# raises an IndexError.
first_hits = results[['Subject', 'Body']].head(10)
for subject, body in first_hits.itertuples(index=False, name=None):
    print()
    print('SUBJECT: ', subject)
    print('-'*20)
    # Show at most five 70-column lines of each body
    for line in textwrap.wrap(body, width=70, max_lines=5):
        print(line)


# ## Accessing Your Gmail Programmatically
# 
# Accessing your own Gmail data involves a few steps:
# 
# 1. Use the Google Developer Console to create or select a project. Turn on the Gmail API.
# 2. Select the Credentials tab, click “Create credentials,” and select “OAuth client ID.”
# 3. Select the application type Other, enter the name “Gmail API Quickstart,” and click the Create button.
# 4. Click OK to dismiss the resulting dialog.
# 5. Click the file download button next to your newly created credentials to download a JSON file containing them.
# 6. Move this file to your working directory and rename it client_secret.json.

# In[ ]:


import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
# Client-secrets JSON file passed to client.flow_from_clientsecrets() in get_credentials()
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to the OAuth flow's user_agent in get_credentials()
APPLICATION_NAME = 'Gmail API Python Quickstart'


# In[ ]:


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    credential_dir = os.path.join(os.path.expanduser('~'), '.credentials')
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME

        # Honour a module-level `flags` object if one has been defined;
        # otherwise build the default run_flow() options. Parsing an empty
        # argument list keeps the notebook kernel's own command-line
        # arguments out of the parser.
        flags = globals().get('flags')
        if flags is None:
            flags = tools.argparser.parse_args([])

        credentials = tools.run_flow(flow, store, flags)
        print('Storing credentials to ' + credential_path)
    return credentials


# In[ ]:


# Build an authorized Gmail API client
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('gmail', 'v1', http=http)

# Request the label list for the authenticated user
label_request = service.users().labels().list(userId='me')
results = label_request.execute()
labels = results.get('labels', [])

if labels:
    print('Labels:')
    for label in labels:
        print(label['name'])
else:
    print('No labels found.')


# ## Fetch Gmail Messages

# In[ ]:


query = 'Mining'
max_results = 10

# Search for Gmail messages containing the query term
results = service.users().messages().list(userId='me', q=query, maxResults=max_results).execute()

# Fall back to an empty list so that a response without a 'messages' entry
# does not raise a KeyError.
for result in results.get('messages', []):
    print(result['id'])
    # Retrieve the message itself
    msg = service.users().messages().get(userId='me', id=result['id'], format='minimal').execute()
    print(msg)