#!/usr/bin/env python
# coding: utf-8

# # Mining the Social Web
# 
# ## Mining GitHub
# 
# This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way.

# ## Programmatically obtaining a personal API access token for accessing GitHub's API

# In[ ]:


import requests
import json

username = ''  # Your GitHub username
password = ''  # Your GitHub password

# Note that credentials will be transmitted over a secure SSL connection
url = 'https://api.github.com/authorizations'
note = 'Mining the Social Web - Mining Github'
post_data = {'scopes': ['repo'], 'note': note}

# Note: GitHub has since retired this Authorizations API. If the request fails,
# create a personal access token manually at https://github.com/settings/tokens.
response = requests.post(
    url,
    auth=(username, password),
    data=json.dumps(post_data),
)

print("API response:", response.text)
print()
print("Your OAuth token is", response.json()['token'])

# Go to https://github.com/settings/tokens to revoke this token


# ## Making direct HTTP requests to GitHub's API

# In[ ]:


import json
import requests

# An unauthenticated request that doesn't contain an ?access_token=xxx query string
url = "https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers"
response = requests.get(url)

# Display one stargazer
print(json.dumps(response.json()[0], indent=1))
print()

# Display headers
for (k, v) in response.headers.items():
    print(k, "=>", v)


# ## Using PyGithub to query for stargazers of a particular repository

# In[ ]:


from github import Github  # pip install pygithub

# XXX: Specify your own access token here
ACCESS_TOKEN = ''

# Specify a username and repository of interest for that user.
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'
#REPO = 'Mining-the-Social-Web-2nd-Edition'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Get a list of people who have bookmarked the repo.
# Since you'll get a lazy iterator back, you have to traverse
# it if you want to get the total number of stargazers.
stargazers = [s for s in repo.get_stargazers()]
print("Number of stargazers", len(stargazers))


# ## Constructing a trivial property graph

# In[ ]:


import networkx as nx  # pip install networkx

# Create a directed graph
g = nx.DiGraph()

# Add an edge to the directed graph from X to Y
g.add_edge('X', 'Y')

# Print some statistics about the graph
print(nx.info(g))


# In[ ]:


# Get the nodes and edges from the graph
print("Nodes:", g.nodes())
print("Edges:", g.edges())
print()

# Get node properties
print("X props:", g.node['X'])
print("Y props:", g.node['Y'])
print()

# Get edge properties
print("X=>Y props:", g['X']['Y'])
print()


# In[ ]:


# Update a node property
g.node['X'].update({'prop1': 'value1'})
print("X props:", g.node['X'])
print()

# Update an edge property
g['X']['Y'].update({'label': 'label1'})
print("X=>Y props:", g['X']['Y'])


# ## Constructing an ego graph of a repository and its stargazers

# In[ ]:


# Expand the initial graph with (interest) edges pointing in each direction for
# additional people interested. Take care to ensure that user and repo nodes
# do not collide by appending their type.

g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)

for sg in stargazers:
    g.add_node(sg.login + '(user)', type='user')
    g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')
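# **Note:** The graph-inspection calls in the rest of this notebook (`g.node[...]`,
# `nodes_iter()`, `edges_iter()`, `in_degree_iter()`) follow the NetworkX 1.x API.
# The cell below is a minimal sketch, not part of the original example, showing the
# equivalent 2.x-style lookups against the ego graph `g` built above in case your
# installed NetworkX is newer.

# In[ ]:


# A sketch of NetworkX 2.x equivalents (only relevant on NetworkX >= 2.0)
if int(nx.__version__.split('.')[0]) >= 2:
    print(g.nodes[repo.name + '(repo)'])   # g.node[...] became g.nodes[...]
    print(list(g.nodes(data=True))[:5])    # nodes_iter(data=True) became nodes(data=True)
    print(list(g.edges(data=True))[:5])    # edges_iter(data=True) became edges(data=True)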
# ## Introducing some handy graph operations
# 
# Poke around in the current graph to get a better feel for how NetworkX works.

# In[ ]:


print(nx.info(g))


# In[ ]:


print(g.node['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g.node['ptwobrussell(user)'])
print(g['ptwobrussell(user)']['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g['ptwobrussell(user)'])
print(g['Mining-the-Social-Web(repo)'])


# In[ ]:


print(g.in_edges(['ptwobrussell(user)']))
print(g.out_edges(['ptwobrussell(user)']))


# In[ ]:


print(g.in_edges(['Mining-the-Social-Web(repo)']))
print(g.out_edges(['Mining-the-Social-Web(repo)']))


# ## Calculating degree, betweenness, and closeness centrality measures on the Krackhardt kite graph

# In[ ]:


from operator import itemgetter
from IPython.display import HTML
from IPython.core.display import display

display(HTML(''))

# The classic Krackhardt kite graph
kkg = nx.generators.small.krackhardt_kite_graph()

print("Degree Centrality")
print(sorted(nx.degree_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
print()

print("Betweenness Centrality")
print(sorted(nx.betweenness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
print()

print("Closeness Centrality")
print(sorted(nx.closeness_centrality(kkg).items(),
             key=itemgetter(1), reverse=True))
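# **Note:** The "follows" crawl in the next section can consume thousands of API
# requests. The cell below is a minimal sketch, using the same PyGithub `client`
# created earlier, of checking how much of the hourly request budget remains before
# starting; `rate_limiting` and `rate_limiting_resettime` are PyGithub properties,
# but double-check them against your installed version.

# In[ ]:


import time

remaining, limit = client.rate_limiting        # (requests remaining, requests per window)
reset_at = client.rate_limiting_resettime      # Unix timestamp for the next window
print("Requests remaining:", remaining, "of", limit)
print("Window resets at:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(reset_at)))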
# ## Adding additional interest edges to the graph through the inclusion of "follows" edges

# In[ ]:


# Add (social) edges from the stargazers' followers. This can take a while
# because of all of the potential API calls to GitHub. The approximate number
# of requests for followers for each iteration of this loop can be calculated as
# math.ceil(sg.followers / 100.0), since the API returns up to 100 items
# at a time.

import sys

for i, sg in enumerate(stargazers):

    # Add "follows" edges between stargazers in the graph if any relationships exist
    try:
        for follower in sg.get_followers():
            if follower.login + '(user)' in g:
                g.add_edge(follower.login + '(user)', sg.login + '(user)',
                           type='follows')
    except Exception as e:  # ssl.SSLError
        print("Encountered an error fetching followers for", sg.login,
              "Skipping.", file=sys.stderr)
        print(e, file=sys.stderr)

    print("Processed", i+1, "stargazers. Num nodes/edges in graph",
          g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit remaining", client.rate_limiting)


# ## Exploring the updated graph's "follows" edges

# In[ ]:


from operator import itemgetter
from collections import Counter

# Let's see how many social edges we added since last time.
print(nx.info(g))


# In[ ]:


# The number of "follows" edges is the difference
print(len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows']))


# In[ ]:


# The repository owner is possibly one of the more popular users in this graph.
print(len([e for e in g.edges_iter(data=True)
           if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)']))


# In[ ]:


# Let's examine the number of adjacent edges to each node
print(sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10])


# In[ ]:


# Consider the ratio of incoming and outgoing edges for a couple of users with
# high node degrees...

print(len(g.out_edges('angusshire(user)')))
print(len(g.in_edges('angusshire(user)')))


# In[ ]:


# A user who is followed by many but does not follow back.

print(len(g.out_edges('ptwobrussell(user)')))
print(len(g.in_edges('ptwobrussell(user)')))


# In[ ]:


c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
popular_users = [(u, f) for (u, f) in c.most_common() if f > 1]
print("Number of popular users", len(popular_users))
print("Top 10 popular users:", popular_users[:10])


# ## Snapshotting (pickling) the graph's state to disk

# In[ ]:


# Save your work by serializing out (pickling) the graph
nx.write_gpickle(g, "data/github.gpickle.1")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.1")


# ## Applying centrality measures to the interest graph

# In[ ]:


from operator import itemgetter

# Create a copy of the graph so that we can iteratively mutate the copy
# as needed for experimentation
h = g.copy()

# Remove the seed of the interest graph, which is a supernode, in order
# to get a better idea of the network dynamics
h.remove_node('Mining-the-Social-Web(repo)')

# XXX: Remove any other nodes that appear to be supernodes.
# Filter any other nodes that you can by threshold criteria or heuristics
# from inspection (one such heuristic is sketched in the next cell).

# Display the centrality measures for the top 10 nodes

dc = sorted(nx.degree_centrality(h).items(),
            key=itemgetter(1), reverse=True)

print("Degree Centrality")
print(dc[:10])
print()

bc = sorted(nx.betweenness_centrality(h).items(),
            key=itemgetter(1), reverse=True)

print("Betweenness Centrality")
print(bc[:10])
print()

print("Closeness Centrality")
cc = sorted(nx.closeness_centrality(h).items(),
            key=itemgetter(1), reverse=True)
print(cc[:10])
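# One way to act on the XXX note above: treat any user with an unusually large number
# of outgoing edges as a supernode and drop it before recomputing centrality. The cell
# below is a minimal sketch, assuming `h` from the previous cell; the cutoff of 100
# outgoing edges is an arbitrary value chosen for illustration, not one from the
# original text.

# In[ ]:


# Prune likely supernodes (users with very high out-degree) from the copy, then
# recompute degree centrality on the reduced graph.
THRESHOLD = 100  # hypothetical cutoff -- tune by inspecting your own graph

supernodes = [n for n in h.nodes() if len(h.out_edges(n)) > THRESHOLD]
print("Removing", len(supernodes), "likely supernodes")
h.remove_nodes_from(supernodes)

dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print("Degree Centrality (after pruning)")
print(dc[:10])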
# ## Adding starred repositories to the graph

# In[ ]:


# Let's add each stargazer's additional starred repos and add edges
# to find additional interests.

MAX_REPOS = 500

for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        for starred in sg.get_starred()[:MAX_REPOS]:  # Slice to avoid supernodes
            g.add_node(starred.name + '(repo)', type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
    except Exception as e:  # ssl.SSLError
        print("Encountered an error fetching starred repos for", sg.login, "Skipping.")

    print("Processed", i+1, "stargazers' starred repos")
    print("Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit", client.rate_limiting)


# **NOTE: Given that the above example can be very time-consuming to run, be sure to snapshot your work**

# In[ ]:


# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.2")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.2")


# ## Exploring the graph after updates with additional starred repositories

# In[ ]:


# Poke around: how to get users/repos

from operator import itemgetter

print(nx.info(g))
print()

# Get a list of repositories from the graph.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Most popular repos
print("Popular repositories")
print(sorted([(n, d) for (n, d) in g.in_degree_iter()
              if g.node[n]['type'] == 'repo'],
             key=itemgetter(1), reverse=True)[:10])
print()

# Projects gazed at by a user
print("Repositories that ptwobrussell has bookmarked")
print([(n, g.node[n]['lang'])
       for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'gazes'])
print()

# Programming languages for each user
print("Programming languages ptwobrussell is interested in")
print(list(set([g.node[n]['lang']
                for n in g['ptwobrussell(user)']
                if g['ptwobrussell(user)'][n]['type'] == 'gazes'])))
print()

# Find supernodes in the graph by approximating with a high number of
# outgoing edges

print("Supernode candidates")
print(sorted([(n, len(g.out_edges(n)))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500],
             key=itemgetter(1), reverse=True))


# ## Updating the graph to include nodes for programming languages

# In[ ]:


# Iterate over all of the repos, and add edges for programming languages
# for each person in the graph. We'll also add edges back to repos so that
# we have a good point to "pivot" upon.

repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

for repo in repos:
    lang = (g.node[repo]['lang'] or "") + "(lang)"

    stargazers = [u for (u, r, d) in g.in_edges_iter(repo, data=True)
                  if d['type'] == 'gazes']

    for sg in stargazers:
        g.add_node(lang, type='lang')
        g.add_edge(sg, lang, type='programs')
        g.add_edge(lang, repo, type='implements')


# ## Sample queries for the final graph

# In[ ]:


# Some stats
print(nx.info(g))
print()

# What languages exist in the graph?
print([n for n in g.nodes_iter() if g.node[n]['type'] == 'lang'])
print()

# What languages do users program with?
print([n for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'programs'])
print()

# What is the most popular programming language?
print("Most popular languages")
print(sorted([(n, g.in_degree(n)) for n in g.nodes_iter()
              if g.node[n]['type'] == 'lang'],
             key=itemgetter(1), reverse=True)[:10])
print()

# How many users program in a particular language?
python_programmers = [u for (u, l) in g.in_edges_iter('Python(lang)')
                      if g.node[u]['type'] == 'user']
print("Number of Python programmers:", len(python_programmers))
print()

javascript_programmers = [u for (u, l) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print("Number of JavaScript programmers:", len(javascript_programmers))
print()

# What users program in both Python and JavaScript?
print("Number of programmers who use JavaScript and Python")
print(len(set(python_programmers).intersection(set(javascript_programmers))))

# Programmers who use JavaScript but not Python
print("Number of programmers who use JavaScript but not Python")
print(len(set(javascript_programmers).difference(set(python_programmers))))

# XXX: Can you determine who is the most polyglot programmer?
# (One possible approach is sketched in the next cell.)
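# One possible answer to the XXX question above: treat the most polyglot programmer as
# the user with "programs" edges to the largest number of distinct language nodes. The
# cell below is a minimal sketch, assuming the final graph `g` and the '(user)'/'(lang)'
# naming convention used throughout this notebook.

# In[ ]:


# Count distinct languages per user via outgoing "programs" edges
langs_per_user = sorted(
    [(u, len([n for n in g[u] if g[u][n]['type'] == 'programs']))
     for u in g.nodes_iter() if g.node[u]['type'] == 'user'],
    key=itemgetter(1), reverse=True)

print("Most polyglot users")
print(langs_per_user[:10])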
# **NOTE: Optionally, snapshot the final graph**

# In[ ]:


# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.3")

# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.3")


# ## Graph visualization of the social network for the original interest graph

# In[ ]:


print("Stats on the full graph")
print(nx.info(g))
print()

# Create a subgraph from a collection of nodes. In this case, the
# collection is all of the users in the original interest graph
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)

print("Stats on the extracted subgraph")
print(nx.info(h))


# In[ ]:


import os
import json
from IPython.display import IFrame
from IPython.core.display import display
from networkx.readwrite import json_graph

# Visualize the social network of all people from the original interest graph.
d = json_graph.node_link_data(h)
json.dump(d, open('force.json', 'w'))

# IPython Notebook can serve files and display them in inline frames. Prepend
# the path with the 'files' prefix.

# A D3 template for displaying the graph data.
viz_file = 'force.html'

# Display the D3 visualization.
display(IFrame(viz_file, '100%', '500px'))


# ## Using Matplotlib and NetworkX to create graph visualizations

# In[ ]:


import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

get_ipython().run_line_magic('matplotlib', 'inline')

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)

labels = dict([(n, n.split('(user)')[0]) for n in h.nodes_iter()])

nx.draw(h, pos=nx.spring_layout(h), arrows=False, ax=ax, node_size=50,
        edge_color='#aaaaaa', alpha=0.8, labels=labels, font_size=8)
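# If you want to keep the rendered layout, the figure can also be written to disk.
# A minimal sketch; the filename and DPI are just example values.

# In[ ]:


# Save the rendered graph drawing alongside the notebook
fig.savefig('github_interest_graph.png', dpi=150, bbox_inches='tight')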