# Downloading nltk packages used in this example
ne_chunks = list(nltk.chunk.ne_chunk_sents(pos_tagged_tokens))
# May also require the installation of Java runtime libraries
# pip install boilerpipe3
from boilerpipe.extract import Extractor
extractor = Extractor(extractor='ArticleExtractor', url=URL)
import feedparser # pip install feedparser
fp = feedparser.parse(FEED_URL)
for e in fp.entries:
import os
import sys
import json
import feedparser
from bs4 import BeautifulSoup
from nltk import clean_html
FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'
def cleanHtml(html):
    """Strip markup from an HTML fragment and return its plain text.

    Args:
        html: HTML source to clean. An empty string or ``None`` is accepted.

    Returns:
        The text produced by BeautifulSoup's ``get_text()`` using the
        ``html5lib`` parser, or ``""`` when there is nothing to parse.
    """
    # Guard on truthiness rather than `== ""` so a missing value (None) is
    # treated like empty input instead of being handed to the parser.
    if not html:
        return ""
    return BeautifulSoup(html, 'html5lib').get_text()
# Fetch the feed, reduce each entry to title/content/link, and save as JSON.
fp = feedparser.parse(FEED_URL)

# Report how many entries were fetched: the length of the entries list,
# not the character count of the first entry's title.
print("Fetched {0} entries from '{1}'".format(len(fp.entries), fp.feed.title))

blog_posts = [
    {
        'title': e.title,
        # Entry content is HTML; keep only its text.
        'content': cleanHtml(e.content[0].value),
        'link': e.links[0].href,
    }
    for e in fp.entries
]

out_file = os.path.join('feed.json')

# The context manager closes (and therefore flushes) the file even if
# serialization fails part-way through.
with open(out_file, 'w+') as f:
    f.write(json.dumps(blog_posts, indent=1))

print('Wrote output file to {0}'.format(f.name))
import httplib2
import re
from bs4 import BeautifulSoup
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')
soup = BeautifulSoup(response, 'html5lib')
links = []
for link in soup.findAll('a', attrs={'href': re.compile("^http(s?)://")}):
for link in links:
Naive sentence detection based on periods
text = "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow."
More sophisticated sentence detection
import nltk # Installation instructions: http://www.nltk.org/install.html
# Downloading nltk packages used in this example
sentences = nltk.tokenize.sent_tokenize(text)
harder_example = """My name is John Smith and my email address is j.smith@company.com.
Mostly people call Mr. Smith. But I actually have a Ph.D.!
Can you believe it? Neither can most people..."""
sentences = nltk.tokenize.sent_tokenize(harder_example)
Word tokenization
text = "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow."
sentences = nltk.tokenize.sent_tokenize(text)
tokens = [nltk.word_tokenize(s) for s in sentences]
Part of speech tagging for tokens
# Downloading nltk packages used in this example
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
Alphabetical list of part-of-speech tags used in the Penn Treebank Project
See: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# | POS Tag | Meaning |
1 | CC | Coordinating conjunction |
2 | CD | Cardinal number |
3 | DT | Determiner |
4 | EX | Existential there |
5 | FW | Foreign word |
6 | IN | Preposition or subordinating conjunction |
7 | JJ | Adjective |
8 | JJR | Adjective, comparative |
9 | JJS | Adjective, superlative |
10 | LS | List item marker |
11 | MD | Modal |
12 | NN | Noun, singular or mass |
13 | NNS | Noun, plural |
14 | NNP | Proper noun, singular |
15 | NNPS | Proper noun, plural |
16 | PDT | Predeterminer |
17 | POS | Possessive ending |
18 | PRP | Personal pronoun |
19 | PRP$ | Possessive pronoun |
20 | RB | Adverb |
21 | RBR | Adverb, comparative |
22 | RBS | Adverb, superlative |
23 | RP | Particle |
24 | SYM | Symbol |
25 | TO | to |
26 | UH | Interjection |
27 | VB | Verb, base form |
28 | VBD | Verb, past tense |
29 | VBG | Verb, gerund or present participle |
30 | VBN | Verb, past participle |
31 | VBP | Verb, non-3rd person singular present |
32 | VBZ | Verb, 3rd person singular present |
33 | WDT | Wh-determiner |
34 | WP | Wh-pronoun |
35 | WP$ | Possessive wh-pronoun |
36 | WRB | Wh-adverb |
Named entity extraction/chunking for tokens
# Downloading nltk packages used in this example
jim = "Jim bought 300 shares of Acme Corp. in 2006."
tokens = nltk.word_tokenize(jim)
jim_tagged_tokens = nltk.pos_tag(tokens)
ne_chunks = nltk.chunk.ne_chunk(jim_tagged_tokens)
ne_chunks = [nltk.chunk.ne_chunk(ptt) for ptt in pos_tagged_tokens]
import json
import nltk
BLOG_DATA = "resources/ch06-webpages/feed.json"
blog_data = json.loads(open(BLOG_DATA).read())
# Download nltk packages used in this example
# Customize your list of stopwords as needed. Here, we add common
# punctuation and contraction artifacts.
stop_words = nltk.corpus.stopwords.words('english') + [
for post in blog_data:
sentences = nltk.tokenize.sent_tokenize(post['content'])
words = [w.lower() for sentence in sentences for w in
fdist = nltk.FreqDist(words)
# Remove stopwords from fdist
for sw in stop_words:
del fdist[sw]
# Basic stats
num_words = sum([i[1] for i in fdist.items()])
num_unique_words = len(fdist.keys())
# Hapaxes are words that appear only once
num_hapaxes = len(fdist.hapaxes())
top_10_words_sans_stop_words = fdist.most_common(10)
print('\tNum Sentences:'.ljust(25), len(sentences))
print('\tNum Words:'.ljust(25), num_words)
print('\tNum Unique Words:'.ljust(25), num_unique_words)
print('\tNum Hapaxes:'.ljust(25), num_hapaxes)
print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t', \
'\n\t\t'.join(['{0} ({1})'.format(w[0], w[1]) for w in top_10_words_sans_stop_words]))
import json
import nltk
import numpy
BLOG_DATA = "feed.json"
blog_data = json.loads(open(BLOG_DATA).read())
N = 100 # Number of words to consider
CLUSTER_THRESHOLD = 5 # Distance between words to consider
TOP_SENTENCES = 5 # Number of sentences to return for a "top n" summary
stop_words = nltk.corpus.stopwords.words('english') + [
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    """Score each sentence by its densest cluster of important words.

    Every sentence is word-tokenized and the position of each important word
    (first occurrence only) is recorded. Positions closer together than the
    module-level ``CLUSTER_THRESHOLD`` are grouped into a cluster, and a
    cluster scores ``significant_words ** 2 / cluster_span``. A sentence's
    score is the best score among its clusters.

    Args:
        sentences: Sentence strings to score.
        important_words: Words considered significant. They are matched
            against tokens exactly, so the caller is responsible for any
            case normalization.

    Returns:
        A list of ``(sentence_idx, score)`` tuples, where ``sentence_idx`` is
        the position of the sentence in ``sentences``. Sentences that contain
        none of the important words are omitted.
    """
    scores = []

    # enumerate() keeps sentence_idx tied to the sentence's real position,
    # including when a sentence is skipped below.
    tokenized = (nltk.tokenize.word_tokenize(s) for s in sentences)
    for sentence_idx, tokens in enumerate(tokenized):

        # Compute an index for where any important words occur in the sentence.
        word_idx = []
        for w in important_words:
            try:
                word_idx.append(tokens.index(w))
            except ValueError:  # w not in this particular sentence
                pass

        # It is possible that some sentences may not contain any important words at all.
        if not word_idx:
            continue

        # Clustering compares consecutive positions, so they must be ordered.
        word_idx.sort()

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.
        clusters = []
        cluster = [word_idx[0]]
        for previous, current in zip(word_idx, word_idx[1:]):
            if current - previous < CLUSTER_THRESHOLD:
                cluster.append(current)
            else:
                clusters.append(cluster)
                cluster = [current]
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            # true clusters also contain insignificant words, so we get
            # the total cluster length by checking the indices
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster**2 / total_words_in_cluster
            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, max_cluster_score))

    return scores
def summarize(txt):
    """Build two extractive summaries of ``txt``.

    The text is split into sentences, the ``N`` most frequent non-stopword
    tokens are selected, and each sentence is scored with
    ``_score_sentences``. Relies on the module-level ``N``, ``TOP_SENTENCES``
    and ``stop_words``.

    Args:
        txt: The document text to summarize.

    Returns:
        A dict with two lists of original (non-lowercased) sentences, each in
        document order:

        * ``top_n_summary`` - the ``TOP_SENTENCES`` highest-scoring sentences.
        * ``mean_scored_summary`` - sentences scoring above the mean plus
          half a standard deviation.

        Both lists are empty when no sentence could be scored.
    """
    sentences = nltk.tokenize.sent_tokenize(txt)
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Remove stopwords from fdist
    for sw in stop_words:
        del fdist[sw]

    top_n_words = [w[0] for w in fdist.most_common(N)]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Nothing scored (e.g. empty text): return empty summaries directly rather
    # than asking numpy for the mean/std of an empty list, which gives NaN.
    if not scored_sentences:
        return dict(top_n_summary=[], mean_scored_summary=[])

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    score_values = [score for (_, score) in scored_sentences]
    cutoff = numpy.mean(score_values) + 0.5 * numpy.std(score_values)
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > cutoff]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    # Restore document order so the summary reads naturally.
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
for post in blog_data:
print('=' * len(post['title']))
print('Top N Summary')
print(' '.join(post['top_n_summary']))
print('Mean Scored Summary')
print(' '.join(post['mean_scored_summary']))
import os
from IPython.display import IFrame
from IPython.core.display import display
HTML_TEMPLATE = """<html>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
for post in blog_data:
# Uses previously defined summarize function.
# You could also store a version of the full post with key sentences marked up
# for analysis with simple string replacement...
for summary_type in ['top_n_summary', 'mean_scored_summary']:
post[summary_type + '_marked_up'] = '<p>{0}</p>'.format(post['content'])
for s in post[summary_type]:
post[summary_type + '_marked_up'] = \
post[summary_type + '_marked_up'].replace(s, '<strong>{0}</strong>'.format(s))
filename = post['title'].replace("?", "") + '.summary.' + summary_type + '.html'
f = open(os.path.join(filename), 'wb')
html = HTML_TEMPLATE.format(post['title'] + ' Summary', post[summary_type + '_marked_up'])
print("Data written to", f.name)
# Display any of these files with an inline frame. This displays the
# last file processed by using the last value of f.name...
print("Displaying {0}:".format(f.name))
display(IFrame('files/{0}'.format(f.name), '100%', '600px'))
import nltk
import json
BLOG_DATA = "feed.json"
blog_data = json.loads(open(BLOG_DATA).read())
for post in blog_data:
sentences = nltk.tokenize.sent_tokenize(post['content'])
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
# Flatten the list since we're not using sentence structure
# and sentences are guaranteed to be separated by a special
# POS tuple such as ('.', '.')
pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]
all_entity_chunks = []
previous_pos = None
current_entity_chunk = []
for (token, pos) in pos_tagged_tokens:
if pos == previous_pos and pos.startswith('NN'):
elif pos.startswith('NN'):
if current_entity_chunk != []:
# Note that current_entity_chunk could be a duplicate when appended,
# so frequency analysis again becomes a consideration
all_entity_chunks.append((' '.join(current_entity_chunk), pos))
current_entity_chunk = [token]
previous_pos = pos
# Store the chunks as an index for the document
# and account for frequency while we're at it...
post['entities'] = {}
for c in all_entity_chunks:
post['entities'][c] = post['entities'].get(c, 0) + 1
# For example, we could display just the title-cased entities
print('-' * len(post['title']))
proper_nouns = []
for (entity, pos) in post['entities']:
if entity.istitle():
print('\t{0} ({1})'.format(entity, post['entities'][(entity, pos)]))
import nltk
import json
BLOG_DATA = "feed.json"
def extract_interactions(txt):
    """Find, per sentence, the noun chunks that co-occur in ``txt``.

    The text is split into sentences, tokenized and POS-tagged. Within each
    sentence, consecutive tokens that share the same ``NN*`` tag are joined
    into one chunk. A sentence contributes its chunks only if more than one
    chunk was collected; otherwise it contributes an empty list.

    Args:
        txt: The document text to analyze.

    Returns:
        A dict with two parallel lists of equal length:

        * ``entity_interactions`` - one list of ``(chunk_text, pos_tag)``
          tuples per sentence (possibly empty).
        * ``sentences`` - the sentence strings.
    """
    sentences = nltk.tokenize.sent_tokenize(txt)
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    entity_interactions = []
    for sentence in pos_tagged_tokens:

        all_entity_chunks = []
        previous_pos = None
        current_entity_chunk = []

        for (token, pos) in sentence:

            if pos == previous_pos and pos.startswith('NN'):
                # Same noun tag as the previous token: extend the current chunk.
                current_entity_chunk.append(token)
            elif pos.startswith('NN'):
                # A new noun run begins: record the pending chunk, then start
                # a fresh one.
                # NOTE(review): a chunk is recorded only when a later noun run
                # starts, so the last noun run of a sentence is never emitted,
                # and the stored tag is that of the token starting the new
                # run. Confirm whether this is intended.
                if current_entity_chunk != []:
                    all_entity_chunks.append((' '.join(current_entity_chunk),
                                              pos))
                current_entity_chunk = [token]

            previous_pos = pos

        # An "interaction" needs more than one chunk; still append a
        # placeholder so the result stays aligned with `sentences`.
        if len(all_entity_chunks) > 1:
            entity_interactions.append(all_entity_chunks)
        else:
            entity_interactions.append([])

    assert len(entity_interactions) == len(sentences)

    return dict(entity_interactions=entity_interactions,
                sentences=sentences)
blog_data = json.loads(open(BLOG_DATA).read())
# Display selected interactions on a per-sentence basis
for post in blog_data:
print('-' * len(post['title']))
for interactions in post['entity_interactions']:
print('; '.join([i[0] for i in interactions]))
import os
import json
import nltk
from IPython.display import IFrame
from IPython.core.display import display
BLOG_DATA = "feed.json"
HTML_TEMPLATE = """<html>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
blog_data = json.loads(open(BLOG_DATA).read())
for post in blog_data:
# Display output as markup with entities presented in bold text
post['markup'] = []
for sentence_idx in range(len(post['sentences'])):
s = post['sentences'][sentence_idx]
for (term, _) in post['entity_interactions'][sentence_idx]:
s = s.replace(term, '<strong>{0}</strong>'.format(term))
post['markup'] += [s]
filename = post['title'].replace("?", "") + '.entity_interactions.html'
f = open(os.path.join(filename), 'wb')
html = HTML_TEMPLATE.format(post['title'] + ' Interactions', ' '.join(post['markup']))
print('Data written to', f.name)
# Display any of these files with an inline frame. This displays the
# last file processed by using the last value of f.name...
print('Displaying {0}:'.format(f.name))
display(IFrame('files/{0}'.format(f.name), '100%', '600px'))