#!/usr/bin/env python
# coding: utf-8

# # Mining the Social Web
# 
# ## Mining GitHub
# 
# This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way.

# ## Programmatically obtaining a personal API access token for accessing GitHub's API

# In[ ]:


import requests
import json

username = ''  # Your GitHub username
password = ''  # Your GitHub password

# Note that credentials will be transmitted over a secure SSL connection
url = 'https://api.github.com/authorizations'
note = 'Mining the Social Web - Mining Github'
post_data = {'scopes': ['repo'], 'note': note}

# Note: GitHub has since retired this Authorizations API. If the request fails,
# create a personal access token manually at https://github.com/settings/tokens.
response = requests.post(
    url,
    auth=(username, password),
    data=json.dumps(post_data),
)

print("API response:", response.text)
print()
print("Your OAuth token is", response.json()['token'])

# Go to https://github.com/settings/tokens to revoke this token


# ## Making direct HTTP requests to GitHub's API

# In[ ]:


import json
import requests

# An unauthenticated request that doesn't contain an ?access_token=xxx query string
url = "https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers"
response = requests.get(url)

# Display one stargazer
print(json.dumps(response.json()[0], indent=1))
print()

# Display headers
for (k, v) in response.headers.items():
    print(k, "=>", v)


# ## Using PyGithub to query for stargazers of a particular repository

# In[ ]:


from github import Github  # pip install pygithub

# XXX: Specify your own access token here
ACCESS_TOKEN = ''

# Specify a username and repository of interest for that user.
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'
#REPO = 'Mining-the-Social-Web-2nd-Edition'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Get a list of people who have bookmarked the repo.
# Since you'll get a lazy iterator back, you have to traverse
# it if you want to get the total number of stargazers.
stargazers = [s for s in repo.get_stargazers()]
print("Number of stargazers", len(stargazers))


# ## Constructing a trivial property graph

# In[ ]:


import networkx as nx  # pip install networkx

# Create a directed graph
g = nx.DiGraph()

# Add an edge to the directed graph from X to Y
g.add_edge('X', 'Y')

# Print some statistics about the graph
print(nx.info(g))


# In[ ]:


# Get the nodes and edges from the graph
print("Nodes:", g.nodes())
print("Edges:", g.edges())
print()

# Get node properties
print("X props:", g.node['X'])
print("Y props:", g.node['Y'])
print()

# Get edge properties
print("X=>Y props:", g['X']['Y'])
print()


# In[ ]:


# Update a node property
g.node['X'].update({'prop1': 'value1'})
print("X props:", g.node['X'])
print()

# Update an edge property
g['X']['Y'].update({'label': 'label1'})
print("X=>Y props:", g['X']['Y'])


# ## Constructing an ego graph of a repository and its stargazers

# In[ ]:


# Expand the initial graph with (interest) edges pointing in each direction for
# additional people interested. Take care to ensure that user and repo nodes
# do not collide by appending their type.

g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)

for sg in stargazers:
    g.add_node(sg.login + '(user)', type='user')
    g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')
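# **Note:** The graph-inspection calls in the rest of this notebook (`g.node[...]`,
# `nodes_iter()`, `edges_iter()`, `in_degree_iter()`) follow the NetworkX 1.x API.
# The cell below is a minimal sketch, not part of the original example, showing the
# equivalent 2.x-style lookups against the ego graph `g` built above in case your
# installed NetworkX is newer.

# In[ ]:


# A sketch of NetworkX 2.x equivalents (only relevant on NetworkX >= 2.0)
if int(nx.__version__.split('.')[0]) >= 2:
    print(g.nodes[repo.name + '(repo)'])   # g.node[...] became g.nodes[...]
    print(list(g.nodes(data=True))[:5])    # nodes_iter(data=True) became nodes(data=True)
    print(list(g.edges(data=True))[:5])    # edges_iter(data=True) became edges(data=True)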
# ## Introducing some handy graph operations
# 
# Poke around in the current graph to get a better feel for how NetworkX works.

# In[ ]:


print(nx.info(g))


# In[ ]:


print(g.node['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g.node['ptwobrussell(user)'])
print(g['ptwobrussell(user)']['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g['ptwobrussell(user)'])
print(g['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g.in_edges(['ptwobrussell(user)']))
print(g.out_edges(['ptwobrussell(user)']))


# In[ ]:


print(g.in_edges(['Mining-the-Social-Web(repo)']))
print(g.out_edges(['Mining-the-Social-Web(repo)']))


# ## Calculating degree, betweenness, and closeness centrality measures on the Krackhardt kite graph

# In[ ]:


from operator import itemgetter
from IPython.display import HTML
from IPython.core.display import display

display(HTML(''))

# The classic Krackhardt kite graph
kkg = nx.generators.small.krackhardt_kite_graph()

print("Degree Centrality")
print(sorted(nx.degree_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
print()

print("Betweenness Centrality")
print(sorted(nx.betweenness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
print()

print("Closeness Centrality")
print(sorted(nx.closeness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
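# **Note:** The "follows" crawl in the next section can consume thousands of API
# requests. The cell below is a minimal sketch, using the same PyGithub `client`
# created earlier, of checking how much of the hourly request budget remains before
# starting; `rate_limiting` and `rate_limiting_resettime` are PyGithub properties,
# but double-check them against your installed version.

# In[ ]:


import time

remaining, limit = client.rate_limiting        # (requests remaining, requests per window)
reset_at = client.rate_limiting_resettime      # Unix timestamp for the next window
print("Requests remaining:", remaining, "of", limit)
print("Window resets at:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(reset_at)))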
# ## Adding additional interest edges to the graph through the inclusion of "follows" edges

# In[ ]:


# Add (social) edges from the stargazers' followers. This can take a while
# because of all of the potential API calls to GitHub. The approximate number
# of requests for followers for each iteration of this loop can be calculated as
# math.ceil(sg.followers / 100.0), since the API returns up to 100 items
# at a time.

import sys

for i, sg in enumerate(stargazers):

    # Add "follows" edges between stargazers in the graph if any relationships exist
    try:
        for follower in sg.get_followers():
            if follower.login + '(user)' in g:
                g.add_edge(follower.login + '(user)', sg.login + '(user)',
                           type='follows')
    except Exception as e:  # ssl.SSLError
        print("Encountered an error fetching followers for", sg.login,
              "Skipping.", file=sys.stderr)
        print(e, file=sys.stderr)

    print("Processed", i+1, "stargazers. Num nodes/edges in graph",
          g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit remaining", client.rate_limiting)


# ## Exploring the updated graph's "follows" edges

# In[ ]:


from operator import itemgetter
from collections import Counter

# Let's see how many social edges we added since last time.
print(nx.info(g))


# In[ ]:


# The number of "follows" edges is the difference
print(len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows']))


# In[ ]:


# The repository owner is possibly one of the more popular users in this graph.
print(len([e for e in g.edges_iter(data=True)
           if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)']))


# In[ ]:


# Let's examine the number of adjacent edges to each node
print(sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10])


# In[ ]:


# Consider the ratio of incoming and outgoing edges for a couple of users with
# high node degrees...

print(len(g.out_edges('angusshire(user)')))
print(len(g.in_edges('angusshire(user)')))


# In[ ]:


# A user who is followed by many but does not follow back.

print(len(g.out_edges('ptwobrussell(user)')))
print(len(g.in_edges('ptwobrussell(user)')))


# In[ ]:


c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
popular_users = [(u, f) for (u, f) in c.most_common() if f > 1]
print("Number of popular users", len(popular_users))
print("Top 10 popular users:", popular_users[:10])


# ## Snapshotting (pickling) the graph's state to disk

# In[ ]:


# Save your work by serializing out (pickling) the graph
nx.write_gpickle(g, "data/github.gpickle.1")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.1")


# ## Applying centrality measures to the interest graph

# In[ ]:


from operator import itemgetter

# Create a copy of the graph so that we can iteratively mutate the copy
# as needed for experimentation
h = g.copy()

# Remove the seed of the interest graph, which is a supernode, in order
# to get a better idea of the network dynamics
h.remove_node('Mining-the-Social-Web(repo)')

# XXX: Remove any other nodes that appear to be supernodes.
# Filter any other nodes that you can by threshold criteria or heuristics
# from inspection (one such heuristic is sketched in the next cell).

# Display the centrality measures for the top 10 nodes

dc = sorted(nx.degree_centrality(h).items(),
            key=itemgetter(1), reverse=True)

print("Degree Centrality")
print(dc[:10])
print()

bc = sorted(nx.betweenness_centrality(h).items(),
            key=itemgetter(1), reverse=True)

print("Betweenness Centrality")
print(bc[:10])
print()

print("Closeness Centrality")
cc = sorted(nx.closeness_centrality(h).items(),
            key=itemgetter(1), reverse=True)
print(cc[:10])
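# One way to act on the XXX note above: treat any user with an unusually large number
# of outgoing edges as a supernode and drop it before recomputing centrality. The cell
# below is a minimal sketch, assuming `h` from the previous cell; the cutoff of 100
# outgoing edges is an arbitrary value chosen for illustration, not one from the
# original text.

# In[ ]:


# Prune likely supernodes (users with very high out-degree) from the copy, then
# recompute degree centrality on the reduced graph.
THRESHOLD = 100  # hypothetical cutoff -- tune by inspecting your own graph

supernodes = [n for n in h.nodes() if len(h.out_edges(n)) > THRESHOLD]
print("Removing", len(supernodes), "likely supernodes")
h.remove_nodes_from(supernodes)

dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print("Degree Centrality (after pruning)")
print(dc[:10])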
# ## Adding starred repositories to the graph

# In[ ]:


# Let's add each stargazer's additional starred repos and add edges
# to find additional interests.

MAX_REPOS = 500

for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        for starred in sg.get_starred()[:MAX_REPOS]:  # Slice to avoid supernodes
            g.add_node(starred.name + '(repo)', type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
    except Exception as e:  # ssl.SSLError
        print("Encountered an error fetching starred repos for", sg.login, "Skipping.")

    print("Processed", i+1, "stargazers' starred repos")
    print("Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit", client.rate_limiting)


# **NOTE: Given that the above example can be very time-consuming to run, be sure to snapshot your work**

# In[ ]:


# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.2")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.2")


# ## Exploring the graph after updates with additional starred repositories

# In[ ]:


# Poke around: how to get users/repos

from operator import itemgetter

print(nx.info(g))
print()

# Get a list of repositories from the graph.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Most popular repos
print("Popular repositories")
print(sorted([(n, d) for (n, d) in g.in_degree_iter()
              if g.node[n]['type'] == 'repo'],
             key=itemgetter(1), reverse=True)[:10])
print()

# Projects gazed at by a user
print("Repositories that ptwobrussell has bookmarked")
print([(n, g.node[n]['lang'])
       for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'gazes'])
print()

# Programming languages for each user
print("Programming languages ptwobrussell is interested in")
print(list(set([g.node[n]['lang']
                for n in g['ptwobrussell(user)']
                if g['ptwobrussell(user)'][n]['type'] == 'gazes'])))
print()

# Find supernodes in the graph by approximating with a high number of
# outgoing edges

print("Supernode candidates")
print(sorted([(n, len(g.out_edges(n)))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500],
             key=itemgetter(1), reverse=True))


# ## Updating the graph to include nodes for programming languages

# In[ ]:


# Iterate over all of the repos, and add edges for programming languages
# for each person in the graph. We'll also add edges back to repos so that
# we have a good point to "pivot" upon.

repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

for repo in repos:
    lang = (g.node[repo]['lang'] or "") + "(lang)"

    stargazers = [u for (u, r, d) in g.in_edges_iter(repo, data=True)
                  if d['type'] == 'gazes']

    for sg in stargazers:
        g.add_node(lang, type='lang')
        g.add_edge(sg, lang, type='programs')
        g.add_edge(lang, repo, type='implements')


# ## Sample queries for the final graph

# In[ ]:


# Some stats
print(nx.info(g))
print()

# What languages exist in the graph?
print([n for n in g.nodes_iter() if g.node[n]['type'] == 'lang'])
print()

# What languages do users program with?
print([n for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'programs'])
print()

# What is the most popular programming language?
print("Most popular languages")
print(sorted([(n, g.in_degree(n)) for n in g.nodes_iter()
              if g.node[n]['type'] == 'lang'],
             key=itemgetter(1), reverse=True)[:10])
print()

# How many users program in a particular language?
python_programmers = [u for (u, l) in g.in_edges_iter('Python(lang)')
                      if g.node[u]['type'] == 'user']
print("Number of Python programmers:", len(python_programmers))
print()

javascript_programmers = [u for (u, l) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print("Number of JavaScript programmers:", len(javascript_programmers))
print()

# What users program in both Python and JavaScript?
print("Number of programmers who use JavaScript and Python")
print(len(set(python_programmers).intersection(set(javascript_programmers))))

# Programmers who use JavaScript but not Python
print("Number of programmers who use JavaScript but not Python")
print(len(set(javascript_programmers).difference(set(python_programmers))))

# XXX: Can you determine who is the most polyglot programmer?
# (One possible approach is sketched in the next cell.)
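# One possible answer to the XXX question above: treat the most polyglot programmer as
# the user with "programs" edges to the largest number of distinct language nodes. The
# cell below is a minimal sketch, assuming the final graph `g` and the '(user)'/'(lang)'
# naming convention used throughout this notebook.

# In[ ]:


# Count distinct languages per user via outgoing "programs" edges
langs_per_user = sorted(
    [(u, len([n for n in g[u] if g[u][n]['type'] == 'programs']))
     for u in g.nodes_iter() if g.node[u]['type'] == 'user'],
    key=itemgetter(1), reverse=True)

print("Most polyglot users")
print(langs_per_user[:10])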
# **NOTE: Optionally, snapshot the final graph**

# In[ ]:


# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.3")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.3")


# ## Graph visualization of the social network for the original interest graph

# In[ ]:


print("Stats on the full graph")
print(nx.info(g))
print()

# Create a subgraph from a collection of nodes. In this case, the
# collection is all of the users in the original interest graph
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)

print("Stats on the extracted subgraph")
print(nx.info(h))


# In[ ]:


import os
import json
from IPython.display import IFrame
from IPython.core.display import display
from networkx.readwrite import json_graph

# Visualize the social network of all people from the original interest graph.
d = json_graph.node_link_data(h)
json.dump(d, open('force.json', 'w'))

# IPython Notebook can serve files and display them in inline frames. Prepend
# the path with the 'files' prefix.

# A D3 template for displaying the graph data.
viz_file = 'force.html'

# Display the D3 visualization.
display(IFrame(viz_file, '100%', '500px'))


# ## Using Matplotlib and NetworkX to create graph visualizations

# In[ ]:


import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

get_ipython().run_line_magic('matplotlib', 'inline')

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)

labels = dict([(n, n.split('(user)')[0]) for n in h.nodes_iter()])

nx.draw(h, pos=nx.spring_layout(h), arrows=False, ax=ax, node_size=50,
        edge_color='#aaaaaa', alpha=0.8, labels=labels, font_size=8)
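# If you want to keep the rendered layout, the figure can also be written to disk.
# A minimal sketch; the filename and DPI are just example values.

# In[ ]:


# Save the rendered graph drawing alongside the notebook
fig.savefig('github_interest_graph.png', dpi=150, bbox_inches='tight')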