#!/usr/bin/env python
# coding: utf-8

# # Mining the Social Web
# 
# ## Mining Web Pages
# 
# This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way.

# In[ ]:

import nltk

# Downloading nltk packages used in this example
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Note: pos_tagged_tokens is defined in the part-of-speech tagging cell later in
# this notebook; run that cell first if you execute this one in isolation.
ne_chunks = list(nltk.chunk.ne_chunk_sents(pos_tagged_tokens))

print(ne_chunks)
ne_chunks[0].pprint()


# ## Using boilerpipe to extract the text from a web page
# 
# Example blog post:
# http://radar.oreilly.com/2010/07/louvre-industrial-age-henry-ford.html

# In[ ]:

# May also require the installation of Java runtime libraries
# pip install boilerpipe3

from boilerpipe.extract import Extractor

# If you're interested, learn more about how Boilerpipe works by reading
# Christian Kohlschütter's paper: http://www.l3s.de/~kohlschuetter/boilerplate/

URL = 'https://www.oreilly.com/ideas/ethics-in-data-project-design-its-about-planning'

extractor = Extractor(extractor='ArticleExtractor', url=URL)

print(extractor.getText())


# ## Using feedparser to extract the text (and other fields) from an RSS or Atom feed

# In[ ]:

import feedparser  # pip install feedparser

FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'

fp = feedparser.parse(FEED_URL)

for e in fp.entries:
    print(e.title)
    print(e.links[0].href)
    print(e.content[0].value)


# ## Harvesting blog data by parsing feeds

# In[ ]:

import os
import sys
import json
import feedparser
from bs4 import BeautifulSoup

FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'

def cleanHtml(html):
    if html == "":
        return ""

    return BeautifulSoup(html, 'html5lib').get_text()

fp = feedparser.parse(FEED_URL)

print("Fetched {0} entries from '{1}'".format(len(fp.entries), fp.feed.title))

blog_posts = []
for e in fp.entries:
    blog_posts.append({'title': e.title,
                       'content': cleanHtml(e.content[0].value),
                       'link': e.links[0].href})

out_file = os.path.join('feed.json')
f = open(out_file, 'w+')
f.write(json.dumps(blog_posts, indent=1))
f.close()

print('Wrote output file to {0}'.format(f.name))


# ## Starting to write a web crawler

# In[ ]:

import httplib2
import re
from bs4 import BeautifulSoup

http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')

soup = BeautifulSoup(response, 'html5lib')

links = []

for link in soup.findAll('a', attrs={'href': re.compile("^http(s?)://")}):
    links.append(link.get('href'))

for link in links:
    print(link)


# ```
# Create an empty graph
# Create an empty queue to keep track of nodes that need to be processed
# 
# Add the starting point to the graph as the root node
# Add the root node to a queue for processing
# 
# Repeat until some maximum depth is reached or the queue is empty:
#   Remove a node from the queue
#   For each of the node's neighbors:
#     If the neighbor hasn't already been processed:
#       Add it to the queue
#       Add it to the graph
#       Create an edge in the graph that connects the node and its neighbor
# ```
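# The pseudocode above describes a breadth-first crawl. The cell below is a minimal
# sketch of that idea, combined with the link-scraping code from the previous cell:
# pages become nodes, hyperlinks become edges, and the crawl stops at a maximum depth.
# The crawl() helper, the max_depth parameter, and the use of networkx for the graph
# are illustrative choices for this sketch, not part of the original example.

# In[ ]:

import re
from collections import deque

import httplib2
import networkx as nx  # pip install networkx
from bs4 import BeautifulSoup

def crawl(seed_url, max_depth=1):
    g = nx.DiGraph()                  # empty graph
    queue = deque([(seed_url, 0)])    # queue seeded with the root node
    g.add_node(seed_url)
    http = httplib2.Http()

    while queue:
        url, depth = queue.popleft()  # remove a node from the queue
        if depth >= max_depth:
            continue
        try:
            _, content = http.request(url)
        except Exception:
            continue                  # skip pages that can't be fetched

        soup = BeautifulSoup(content, 'html5lib')
        for link in soup.findAll('a', attrs={'href': re.compile("^http(s?)://")}):
            neighbor = link.get('href')
            if neighbor not in g:     # neighbor hasn't been processed yet
                g.add_node(neighbor)
                queue.append((neighbor, depth + 1))
            g.add_edge(url, neighbor) # connect the node and its neighbor

    return g

# Example (network access required):
# g = crawl('http://www.nytimes.com', max_depth=1)
# print(len(g), 'pages discovered')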
# ## Using NLTK to parse web page data
# 
# **Naive sentence detection based on periods**

# In[ ]:

text = "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow."

print(text.split("."))


# **More sophisticated sentence detection**

# In[ ]:

import nltk  # Installation instructions: http://www.nltk.org/install.html

# Downloading nltk packages used in this example
nltk.download('punkt')


# In[ ]:

sentences = nltk.tokenize.sent_tokenize(text)

print(sentences)


# In[ ]:

harder_example = """My name is John Smith and my email address is j.smith@company.com. Mostly people call Mr. Smith. But I actually have a Ph.D.! Can you believe it? Neither can most people..."""

sentences = nltk.tokenize.sent_tokenize(harder_example)

print(sentences)


# **Word tokenization**

# In[ ]:

text = "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow."

sentences = nltk.tokenize.sent_tokenize(text)

tokens = [nltk.word_tokenize(s) for s in sentences]

print(tokens)


# **Part of speech tagging for tokens**

# In[ ]:

# Downloading nltk packages used in this example
nltk.download('maxent_treebank_pos_tagger')

# Recent NLTK versions use the averaged perceptron tagger for nltk.pos_tag; if pos_tag
# reports a missing resource, also run nltk.download('averaged_perceptron_tagger').
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

print(pos_tagged_tokens)


# **Alphabetical list of part-of-speech tags used in the Penn Treebank Project**
# 
# See: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# 
# | # | POS Tag | Meaning |
# |:-:|:-------:|:--------|
# | 1 | CC | Coordinating conjunction |
# | 2 | CD | Cardinal number |
# | 3 | DT | Determiner |
# | 4 | EX | Existential there |
# | 5 | FW | Foreign word |
# | 6 | IN | Preposition or subordinating conjunction |
# | 7 | JJ | Adjective |
# | 8 | JJR | Adjective, comparative |
# | 9 | JJS | Adjective, superlative |
# | 10 | LS | List item marker |
# | 11 | MD | Modal |
# | 12 | NN | Noun, singular or mass |
# | 13 | NNS | Noun, plural |
# | 14 | NNP | Proper noun, singular |
# | 15 | NNPS | Proper noun, plural |
# | 16 | PDT | Predeterminer |
# | 17 | POS | Possessive ending |
# | 18 | PRP | Personal pronoun |
# | 19 | PRP\$ | Possessive pronoun |
# | 20 | RB | Adverb |
# | 21 | RBR | Adverb, comparative |
# | 22 | RBS | Adverb, superlative |
# | 23 | RP | Particle |
# | 24 | SYM | Symbol |
# | 25 | TO | to |
# | 26 | UH | Interjection |
# | 27 | VB | Verb, base form |
# | 28 | VBD | Verb, past tense |
# | 29 | VBG | Verb, gerund or present participle |
# | 30 | VBN | Verb, past participle |
# | 31 | VBP | Verb, non-3rd person singular present |
# | 32 | VBZ | Verb, 3rd person singular present |
# | 33 | WDT | Wh-determiner |
# | 34 | WP | Wh-pronoun |
# | 35 | WP\$ | Possessive wh-pronoun |
# | 36 | WRB | Wh-adverb |

# **Named entity extraction/chunking for tokens**

# In[ ]:

# Downloading nltk packages used in this example
nltk.download('maxent_ne_chunker')
nltk.download('words')


# In[ ]:

jim = "Jim bought 300 shares of Acme Corp. in 2006."

tokens = nltk.word_tokenize(jim)
jim_tagged_tokens = nltk.pos_tag(tokens)

ne_chunks = nltk.chunk.ne_chunk(jim_tagged_tokens)


# In[ ]:

ne_chunks


# In[ ]:

ne_chunks = [nltk.chunk.ne_chunk(ptt) for ptt in pos_tagged_tokens]

ne_chunks[0].pprint()
ne_chunks[1].pprint()


# In[ ]:

ne_chunks[0]


# In[ ]:

ne_chunks[1]
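# The trees returned by ne_chunk mix plain (token, tag) pairs with labeled subtrees
# such as PERSON, ORGANIZATION, and GPE. The cell below is a minimal sketch of pulling
# out just those labeled entities; the helper name extract_named_entities and the
# default label list are illustrative choices, not part of the original example.

# In[ ]:

def extract_named_entities(chunked_sentence, labels=('PERSON', 'ORGANIZATION', 'GPE')):
    """Collect (label, entity text) pairs from an ne_chunk tree."""
    entities = []
    for subtree in chunked_sentence.subtrees():
        if subtree.label() in labels:
            entities.append((subtree.label(),
                             ' '.join(token for token, pos in subtree.leaves())))
    return entities

# For the "Jim bought 300 shares of Acme Corp. in 2006." example this prints pairs
# such as ('PERSON', 'Jim'), with the exact labels depending on the chunker.
print(extract_named_entities(nltk.chunk.ne_chunk(jim_tagged_tokens)))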
# ## Using NLTK’s NLP tools to process human language in blog data

# In[ ]:

import json
import nltk

BLOG_DATA = "feed.json"  # written by the feed-harvesting cell earlier in this notebook

blog_data = json.loads(open(BLOG_DATA).read())

# Download nltk packages used in this example
nltk.download('stopwords')

# Customize your list of stopwords as needed. Here, we add common
# punctuation and contraction artifacts.

stop_words = nltk.corpus.stopwords.words('english') + [
    '.', ',', '--', '\'s', '?', ')', '(', ':', '\'', '\'re', '"',
    '-', '}', '{', u'—', ']', '[', '...'
    ]


# In[ ]:

for post in blog_data:

    sentences = nltk.tokenize.sent_tokenize(post['content'])

    words = [w.lower() for sentence in sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Remove stopwords from fdist
    for sw in stop_words:
        del fdist[sw]

    # Basic stats

    num_words = sum([i[1] for i in fdist.items()])
    num_unique_words = len(fdist.keys())

    # Hapaxes are words that appear only once
    num_hapaxes = len(fdist.hapaxes())

    top_10_words_sans_stop_words = fdist.most_common(10)

    print(post['title'])
    print('\tNum Sentences:'.ljust(25), len(sentences))
    print('\tNum Words:'.ljust(25), num_words)
    print('\tNum Unique Words:'.ljust(25), num_unique_words)
    print('\tNum Hapaxes:'.ljust(25), num_hapaxes)
    print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t',
          '\n\t\t'.join(['{0} ({1})'.format(w[0], w[1])
                         for w in top_10_words_sans_stop_words]))
    print()


# ## A document summarization algorithm based principally upon sentence detection and frequency analysis within sentences

# In[ ]:

import json
import nltk
import numpy

BLOG_DATA = "feed.json"

blog_data = json.loads(open(BLOG_DATA).read())

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary


# In[ ]:

stop_words = nltk.corpus.stopwords.words('english') + [
    '.', ',', '--', '\'s', '?', ')', '(', ':', '\'', '\'re', '"',
    '-', '}', '{', u'—', '>', '<', '...'
    ]


# In[ ]:

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn

def _score_sentences(sentences, important_words):
    scores = []

    # enumerate keeps sentence indices aligned with the original sentence list,
    # even when a sentence containing no important words is skipped below.
    for sentence_idx, s in enumerate([nltk.tokenize.word_tokenize(s) for s in sentences]):

        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence.
                word_idx.append(s.index(w))
            except ValueError:  # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all.
        if len(word_idx) == 0:
            continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.

        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.

        max_cluster_score = 0

        for c in clusters:
            significant_words_in_cluster = len(c)
            # True clusters also contain insignificant words, so we get
            # the total cluster length by checking the indices.
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster**2 / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, max_cluster_score))

    return scores
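# As a quick worked check of the scoring above: for the toy sentence below, the
# important words sit at token indices 1, 3, and 8. With CLUSTER_THRESHOLD = 5 they
# form the clusters [1, 3] and [8], which score 2**2 / 3 ≈ 1.33 and 1**2 / 1 = 1.0,
# so the sentence receives the maximum, roughly 1.33. The sentence and word list are
# made up purely for illustration.

# In[ ]:

toy_sentences = ['the quick brown fox jumps over the lazy dog']
toy_important_words = ['quick', 'fox', 'dog']

# Expected output: [(0, 1.333...)]
print(_score_sentences(toy_sentences, toy_important_words))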
# In[ ]:

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Remove stopwords from fdist
    for sw in stop_words:
        del fdist[sw]

    top_n_words = [w[0] for w in fdist.most_common(N)]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter.

    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])

    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences.

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])


# In[ ]:

for post in blog_data:
    post.update(summarize(post['content']))

    print(post['title'])
    print('=' * len(post['title']))
    print()
    print('Top N Summary')
    print('-------------')
    print(' '.join(post['top_n_summary']))
    print()
    print('Mean Scored Summary')
    print('-------------------')
    print(' '.join(post['mean_scored_summary']))
    print()


# ## Visualizing document summarization results with HTML output

# In[ ]:

import os

from IPython.display import IFrame
from IPython.core.display import display

# Minimal HTML page template: {0} is the page title, {1} is the body content.
HTML_TEMPLATE = """<html>
    <head>
        <title>{0}</title>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
    </head>
    <body>{1}</body>
</html>"""

for post in blog_data:

    # Uses previously defined summarize function.
    post.update(summarize(post['content']))

    # You could also store a version of the full post with key sentences marked up
    # for analysis with simple string replacement...

    for summary_type in ['top_n_summary', 'mean_scored_summary']:
        post[summary_type + '_marked_up'] = '<p id="content">{0}</p>'.format(post['content'])

        for s in post[summary_type]:
            post[summary_type + '_marked_up'] = \
                post[summary_type + '_marked_up'].replace(s, '<strong>{0}</strong>'.format(s))

        filename = post['title'].replace("?", "") + '.summary.' + summary_type + '.html'

        f = open(os.path.join(filename), 'wb')
        html = HTML_TEMPLATE.format(post['title'] + ' Summary',
                                    post[summary_type + '_marked_up'])
        f.write(html.encode('utf-8'))
        f.close()

        print("Data written to", f.name)

# Display any of these files with an inline frame. This displays the
# last file processed by using the last value of f.name...

print()
print("Displaying {0}:".format(f.name))
display(IFrame('files/{0}'.format(f.name), '100%', '600px'))


# ## Extracting entities from a text with NLTK

# In[ ]:

import nltk
import json

BLOG_DATA = "feed.json"

blog_data = json.loads(open(BLOG_DATA).read())

for post in blog_data:

    sentences = nltk.tokenize.sent_tokenize(post['content'])

    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]

    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]

    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []

    for (token, pos) in pos_tagged_tokens:

        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)

        elif pos.startswith('NN'):
            if current_entity_chunk != []:
                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration
                all_entity_chunks.append((' '.join(current_entity_chunk), pos))

            current_entity_chunk = [token]

        previous_pos = pos

    # Store the chunks as an index for the document
    # and account for frequency while we're at it...

    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1

    # For example, we could display just the title-cased entities

    print(post['title'])
    print('-' * len(post['title']))

    for (entity, pos) in post['entities']:
        if entity.istitle():
            print('\t{0} ({1})'.format(entity, post['entities'][(entity, pos)]))

    print()
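# To see what the contiguous-NN* collapsing above does in isolation, here is a tiny
# standalone illustration on a made-up sentence. It uses a slightly simplified variant
# that joins any run of consecutive NN* tags (the original loop starts a new chunk
# whenever the exact tag changes); the sample text and expected chunks are illustrative.

# In[ ]:

sample = "Tim Cook announced new Apple Watch bands in San Francisco."
tagged = nltk.pos_tag(nltk.word_tokenize(sample))

chunks, current = [], []
for token, pos in tagged:
    if pos.startswith('NN'):
        current.append(token)          # extend the current noun chunk
    elif current:
        chunks.append(' '.join(current))
        current = []
if current:
    chunks.append(' '.join(current))

# Typically yields chunks along the lines of 'Tim Cook', 'Apple Watch bands', and
# 'San Francisco', depending on how the tagger labels each token.
print(chunks)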
# ## Discovering interactions between entities

# In[ ]:

import nltk
import json

BLOG_DATA = "feed.json"

def extract_interactions(txt):
    sentences = nltk.tokenize.sent_tokenize(txt)

    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    entity_interactions = []
    for sentence in pos_tagged_tokens:

        all_entity_chunks = []
        previous_pos = None
        current_entity_chunk = []

        for (token, pos) in sentence:

            if pos == previous_pos and pos.startswith('NN'):
                current_entity_chunk.append(token)

            elif pos.startswith('NN'):
                if current_entity_chunk != []:
                    all_entity_chunks.append((' '.join(current_entity_chunk), pos))

                current_entity_chunk = [token]

            previous_pos = pos

        if len(all_entity_chunks) > 1:
            entity_interactions.append(all_entity_chunks)
        else:
            entity_interactions.append([])

    assert len(entity_interactions) == len(sentences)

    return dict(entity_interactions=entity_interactions,
                sentences=sentences)

blog_data = json.loads(open(BLOG_DATA).read())

# Display selected interactions on a per-sentence basis

for post in blog_data:

    post.update(extract_interactions(post['content']))

    print(post['title'])
    print('-' * len(post['title']))

    for interactions in post['entity_interactions']:
        print('; '.join([i[0] for i in interactions]))

    print()


# ## Visualizing interactions between entities with HTML output

# In[ ]:

import os
import json
import nltk

from IPython.display import IFrame
from IPython.core.display import display

BLOG_DATA = "feed.json"

# Minimal HTML page template: {0} is the page title, {1} is the body content.
HTML_TEMPLATE = """<html>
    <head>
        <title>{0}</title>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
    </head>
    <body>{1}</body>
</html>"""

blog_data = json.loads(open(BLOG_DATA).read())

for post in blog_data:

    post.update(extract_interactions(post['content']))

    # Display output as markup with entities presented in bold text

    post['markup'] = []

    for sentence_idx in range(len(post['sentences'])):

        s = post['sentences'][sentence_idx]
        for (term, _) in post['entity_interactions'][sentence_idx]:
            s = s.replace(term, '<strong>{0}</strong>'.format(term))

        post['markup'] += [s]

    filename = post['title'].replace("?", "") + '.entity_interactions.html'

    f = open(os.path.join(filename), 'wb')
    html = HTML_TEMPLATE.format(post['title'] + ' Interactions', ' '.join(post['markup']))
    f.write(html.encode('utf-8'))
    f.close()

    print('Data written to', f.name)

# Display any of these files with an inline frame. This displays the
# last file processed by using the last value of f.name...

print('Displaying {0}:'.format(f.name))

display(IFrame('files/{0}'.format(f.name), '100%', '600px'))
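# Each entry in entity_interactions is the list of entity chunks found in one sentence,
# so a natural follow-on is to tally how often pairs of entities co-occur in the same
# sentence. The Counter-based sketch below is an illustrative addition, not part of the
# original example.

# In[ ]:

from collections import Counter
from itertools import combinations

pair_counts = Counter()

for post in blog_data:
    for interactions in post['entity_interactions']:
        entities = sorted(set(term for (term, pos) in interactions))
        # Count each unordered pair of distinct entities appearing in the same sentence.
        pair_counts.update(combinations(entities, 2))

for (a, b), count in pair_counts.most_common(10):
    print('{0} <-> {1} ({2})'.format(a, b, count))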