Using `dragnet` to extract the text from a web page

Example blog post: http://radar.oreilly.com/2010/07/louvre-industrial-age-henry-ford.html
In Mining the Social Web, 3rd Edition, we used a library called `boilerpipe` to extract the main content of web pages. `boilerpipe` is a sophisticated piece of software that works very well but has some software dependencies that can be very difficult to install, especially if you do not have administrative privileges on the computer you are working with. I have replaced `boilerpipe` with `dragnet`, which can be easily installed using `pip`.
You can learn more about `dragnet` on its GitHub page.
import dragnet
import requests
from dragnet import extract_content
# Download one article and print only the main content that dragnet extracts
# from the raw page.
URL = 'https://www.oreilly.com/ideas/ethics-in-data-project-design-its-about-planning'

# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of running the extractor on an error page.
r.raise_for_status()

# extract_content is handed the raw response body (bytes).
content = extract_content(r.content)
print(content)
import feedparser # pip install feedparser
# Parse the feed at FEED_URL and print, for every entry, its title, the
# target of its first link, and the value of its first content element.
FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'

fp = feedparser.parse(FEED_URL)

for entry in fp.entries:
    print(entry.title)
    print(entry.links[0].href)
    print(entry.content[0].value)
import os
import sys
import json
import feedparser
from bs4 import BeautifulSoup
from nltk import clean_html
FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'
def cleanHtml(html):
    """Return the text of an HTML fragment with all markup removed.

    Args:
        html: Markup to strip. ``None`` and the empty string are both treated
            as "no content".

    Returns:
        The result of BeautifulSoup's ``get_text()`` on the markup parsed
        with the ``html5lib`` parser, or ``""`` when there is nothing to parse.
    """
    # Truthiness covers None as well as "", so an entry without a body never
    # reaches the parser.
    if not html:
        return ""
    return BeautifulSoup(html, 'html5lib').get_text()
# Fetch the feed, strip the markup from every entry, and save the posts as
# JSON for the analysis cells that follow.
fp = feedparser.parse(FEED_URL)

# Report the number of entries (not the length of the first entry's title).
print("Fetched {0} entries from '{1}'".format(len(fp.entries), fp.feed.title))

blog_posts = [
    {
        'title': e.title,
        'content': cleanHtml(e.content[0].value),
        'link': e.links[0].href,
    }
    for e in fp.entries
]

out_file = os.path.join('Data', 'feed.json')
# Create the output folder on a first run instead of failing in open().
os.makedirs(os.path.dirname(out_file), exist_ok=True)

# The context manager closes the file even if serialization fails.
with open(out_file, 'w') as f:
    f.write(json.dumps(blog_posts, indent=1))

print('Wrote output file to {0}'.format(f.name))
import httplib2
import re
from bs4 import BeautifulSoup
# Download a page and list the href of every anchor whose target is an
# absolute http:// or https:// URL.
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')

soup = BeautifulSoup(response, 'html5lib')

absolute_href = re.compile("^http(s?)://")
links = [
    anchor.get('href')
    for anchor in soup.findAll('a', attrs={'href': absolute_href})
]

for link in links:
    print(link)
Create an empty graph
Create an empty queue to keep track of nodes that need to be processed
Add the starting point to the graph as the root node
Add the root node to a queue for processing
Repeat until some maximum depth is reached or the queue is empty:
    Remove a node from the queue
    For each of the node's neighbors:
        If the neighbor hasn't already been processed:
            Add it to the queue
            Add it to the graph
            Create an edge in the graph that connects the node and its neighbor
Naive sentence detection based on periods
# Splitting on every period also splits after abbreviations such as "Mr.",
# which is why this approach is called naive.
text = (
    "Mr. Green killed Colonel Mustard in the study with the candlestick. "
    "Mr. Green is not a very nice fellow."
)
naive_sentences = text.split(".")
print(naive_sentences)
More sophisticated sentence detection
import nltk # Installation instructions: http://www.nltk.org/install.html
# Fetch the NLTK resource used by the sentence tokenizer in this example
nltk.download('punkt')

harder_example = """My name is John Smith and my email address is j.smith@company.com.
Mostly people call Mr. Smith. But I actually have a Ph.D.!
Can you believe it? Neither can most people..."""

# Tokenize the simple text first, then the harder one, printing each result.
for sample in (text, harder_example):
    sentences = nltk.tokenize.sent_tokenize(sample)
    print(sentences)
Word tokenization
text = (
    "Mr. Green killed Colonel Mustard in the study with the candlestick. "
    "Mr. Green is not a very nice fellow."
)

# One list of word tokens per detected sentence
sentences = nltk.tokenize.sent_tokenize(text)
tokens = list(map(nltk.word_tokenize, sentences))
print(tokens)
Part of speech tagging for tokens
# Fetch the NLTK tagger resources listed for this example
for resource in ('maxent_treebank_pos_tagger', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Tag each sentence's token list; the result is one list of (token, tag)
# pairs per sentence.
pos_tagged_tokens = list(map(nltk.pos_tag, tokens))
print(pos_tagged_tokens)
Alphabetical list of part-of-speech tags used in the Penn Treebank Project
See: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# | POS Tag | Meaning |
---|---|---|
1 | CC | Coordinating conjunction |
2 | CD | Cardinal number |
3 | DT | Determiner |
4 | EX | Existential there |
5 | FW | Foreign word |
6 | IN | Preposition or subordinating conjunction |
7 | JJ | Adjective |
8 | JJR | Adjective, comparative |
9 | JJS | Adjective, superlative |
10 | LS | List item marker |
11 | MD | Modal |
12 | NN | Noun, singular or mass |
13 | NNS | Noun, plural |
14 | NNP | Proper noun, singular |
15 | NNPS | Proper noun, plural |
16 | PDT | Predeterminer |
17 | POS | Possessive ending |
18 | PRP | Personal pronoun |
19 | PRP$ | Possessive pronoun |
20 | RB | Adverb |
21 | RBR | Adverb, comparative |
22 | RBS | Adverb, superlative |
23 | RP | Particle |
24 | SYM | Symbol |
25 | TO | to |
26 | UH | Interjection |
27 | VB | Verb, base form |
28 | VBD | Verb, past tense |
29 | VBG | Verb, gerund or present participle |
30 | VBN | Verb, past participle |
31 | VBP | Verb, non-3rd person singular present |
32 | VBZ | Verb, 3rd person singular present |
33 | WDT | Wh-determiner |
34 | WP | Wh-pronoun |
35 | WP$ | Possessive wh-pronoun |
36 | WRB | Wh-adverb |
Named entity extraction/chunking for tokens
# Fetch the NLTK resources listed for the named-entity chunking example
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

import nltk

jim = "Jim bought 300 shares of Acme Corp. in 2006."

# Single sentence: tokenize -> POS-tag -> chunk
tokens = nltk.word_tokenize(jim)
jim_tagged_tokens = nltk.pos_tag(tokens)
ne_chunks = nltk.chunk.ne_chunk(jim_tagged_tokens)
print(ne_chunks)

# Chunk the sentences that were POS-tagged earlier (pos_tagged_tokens) and
# pretty-print the first two results.
ne_chunks = list(map(nltk.chunk.ne_chunk, pos_tagged_tokens))
for position in (0, 1):
    ne_chunks[position].pprint()
import json
import nltk
BLOG_DATA = "Data/feed.json"

# Load the saved posts. The context manager closes the file handle, which a
# bare open(...).read() leaves open until garbage collection.
with open(BLOG_DATA) as blog_file:
    blog_data = json.load(blog_file)
# Fetch the NLTK stopword corpus used in this example
nltk.download('stopwords')

# Extra tokens to ignore on top of the English stopword list: common
# punctuation and contraction artifacts. Customize as needed.
punctuation_and_contractions = [
    '.', ',', '--', "'s", '?', ')', '(', ':', ';', "'", "'re", '"',
    '-', '}', '{', '—', ']', '[', '...', '“', '”', '’',
]

stop_words = nltk.corpus.stopwords.words('english') + punctuation_and_contractions
# Print simple word-frequency statistics for every post.
for post in blog_data:
    sentences = nltk.tokenize.sent_tokenize(post['content'])

    # Lower-cased tokens of the whole post, gathered sentence by sentence
    words = []
    for sentence in sentences:
        words.extend(token.lower() for token in nltk.tokenize.word_tokenize(sentence))

    fdist = nltk.FreqDist(words)

    # Drop stopwords from the distribution
    for sw in stop_words:
        del fdist[sw]

    # Basic stats over what is left
    num_words = sum(fdist.values())
    num_unique_words = len(fdist)

    # Hapaxes are words that appear only once
    num_hapaxes = len(fdist.hapaxes())

    top_10_words_sans_stop_words = fdist.most_common(10)
    top_10_lines = [
        '{0} ({1})'.format(word, count)
        for (word, count) in top_10_words_sans_stop_words
    ]

    print(post['title'])
    print('\tNum Sentences:'.ljust(25), len(sentences))
    print('\tNum Words:'.ljust(25), num_words)
    print('\tNum Unique Words:'.ljust(25), num_unique_words)
    print('\tNum Hapaxes:'.ljust(25), num_hapaxes)
    print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t',
          '\n\t\t'.join(top_10_lines))
    print()
import json
import nltk
import numpy
BLOG_DATA = "Data/feed.json"

# Load the saved posts; the context manager closes the file handle, which a
# bare open(...).read() leaves open until garbage collection.
with open(BLOG_DATA) as blog_file:
    blog_data = json.load(blog_file)

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    """Score each sentence by its densest cluster of important words.

    Each sentence is tokenized with ``nltk.tokenize.word_tokenize``. The
    position of the first occurrence of every important word is collected;
    consecutive positions fewer than ``CLUSTER_THRESHOLD`` tokens apart are
    grouped into a cluster; a cluster scores
    ``(important words in cluster) ** 2 / (cluster span in tokens)``.
    A sentence's score is the best score among its clusters.

    Args:
        sentences: Sentence strings. Words are matched exactly against the
            tokens, so they must be normalized the same way as
            ``important_words``.
        important_words: Words that count as significant.

    Returns:
        A list of ``(sentence_index, score)`` tuples where ``sentence_index``
        is the sentence's position in ``sentences``. Sentences that contain
        none of the important words are left out.
    """
    scores = []

    # enumerate() ties the index to the sentence's position in `sentences`.
    # A hand-incremented counter that is bypassed by `continue` drifts after
    # the first skipped sentence and makes callers look up the wrong one.
    tokenized = (nltk.tokenize.word_tokenize(s) for s in sentences)
    for sentence_idx, tokens in enumerate(tokenized):
        # list.index() reports only the first occurrence, so each important
        # word contributes at most one position per sentence.
        word_idx = []
        for w in important_words:
            try:
                word_idx.append(tokens.index(w))
            except ValueError:  # w not in this particular sentence
                pass
        word_idx.sort()

        # Some sentences contain no important words at all.
        if not word_idx:
            continue

        # Group positions into clusters using a max distance threshold
        # between any two consecutive important words.
        clusters = []
        cluster = [word_idx[0]]
        for previous, current in zip(word_idx, word_idx[1:]):
            if current - previous < CLUSTER_THRESHOLD:
                cluster.append(current)
            else:
                clusters.append(cluster)
                cluster = [current]
        clusters.append(cluster)

        # True clusters also contain insignificant words, so the total
        # cluster length is taken from the first and last index.
        max_cluster_score = max(
            len(c) ** 2 / (c[-1] - c[0] + 1) for c in clusters
        )
        scores.append((sentence_idx, max_cluster_score))

    return scores
def summarize(txt):
    """Build two extractive summaries of ``txt``.

    Sentences are scored with ``_score_sentences`` against the ``N`` most
    frequent words of the text once the entries of the module-level
    ``stop_words`` have been removed from the frequency distribution.

    Returns:
        A dict with two lists of original (non-lowercased) sentences, both in
        document order:
        ``top_n_summary`` - the ``TOP_SENTENCES`` highest-scoring sentences;
        ``mean_scored_summary`` - sentences scoring above the mean score plus
        half a standard deviation.
    """
    sentences = list(nltk.tokenize.sent_tokenize(txt))
    normalized_sentences = [sentence.lower() for sentence in sentences]

    words = []
    for sentence in normalized_sentences:
        for token in nltk.tokenize.word_tokenize(sentence):
            words.append(token.lower())

    fdist = nltk.FreqDist(words)

    # Remove stopwords from fdist
    for sw in stop_words:
        del fdist[sw]

    top_n_words = [word for (word, _count) in fdist.most_common(N)]
    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # keep sentences whose score exceeds the average score plus a fraction
    # of the standard deviation.
    all_scores = [score for (_idx, score) in scored_sentences]
    avg = numpy.mean(all_scores)
    std = numpy.std(all_scores)
    cutoff = avg + 0.5 * std
    mean_scored = [
        (sent_idx, score)
        for (sent_idx, score) in scored_sentences
        if score > cutoff
    ]

    # Summarization Approach 2:
    # keep only the top-ranked sentences, then restore document order.
    ranked = sorted(scored_sentences, key=lambda pair: pair[1])
    top_n_scored = sorted(ranked[-TOP_SENTENCES:], key=lambda pair: pair[0])

    # Map the surviving indices back to the original sentences
    return {
        'top_n_summary': [sentences[idx] for (idx, _score) in top_n_scored],
        'mean_scored_summary': [sentences[idx] for (idx, _score) in mean_scored],
    }
# Summarize every post and print both summaries under the post title.
summary_sections = (
    ('Top N Summary', '-------------', 'top_n_summary'),
    ('Mean Scored Summary', '-------------------', 'mean_scored_summary'),
)

for post in blog_data:
    post.update(summarize(post['content']))

    print(post['title'])
    print('=' * len(post['title']))
    print()

    for heading, underline, summary_key in summary_sections:
        print(heading)
        print(underline)
        print(' '.join(post[summary_key]))
        print()
import os
from IPython.display import IFrame
from IPython.core.display import display
from html import escape

# Make a folder to dump all the HTML content
os.makedirs('Data/Webpages', exist_ok=True)

# Multiline string template of HTML
HTML_TEMPLATE = """<html>
<head>
<title>{0}</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>{1}</body>
</html>"""

# Path of the most recently written page; stays None when there are no posts.
last_written = None

for post in blog_data:
    # Uses previously defined summarize function.
    post.update(summarize(post['content']))

    # Store a version of the full post with the key sentences marked up
    # for analysis with simple string replacement...
    for summary_type in ['top_n_summary', 'mean_scored_summary']:
        # The post text comes from an external feed. Escape it so literal
        # <, > and & (e.g. code samples) are displayed rather than
        # interpreted as markup by the browser.
        marked_up = '<p>{0}</p>'.format(escape(post['content'], quote=False))
        for s in post[summary_type]:
            # Escape the sentence the same way so it still matches.
            safe_sentence = escape(s, quote=False)
            marked_up = marked_up.replace(
                safe_sentence, '<strong>{0}</strong>'.format(safe_sentence))
        post[summary_type + '_marked_up'] = marked_up

        # NOTE(review): only "?" is stripped from the title; a title that
        # contains a path separator would still produce an unusable path.
        filename = post['title'].replace("?", "") + '.summary.' + summary_type + '.html'
        # Forward slashes are kept on purpose: the path is reused below as
        # part of a URL.
        path = 'Data/Webpages/' + filename

        page = HTML_TEMPLATE.format(
            escape(post['title'] + ' Summary', quote=False), marked_up)

        # The context manager closes the file even if the write fails.
        with open(path, 'wb') as f:
            f.write(page.encode('utf-8'))

        last_written = f.name
        print("Data written to", f.name)

# Display the last file processed with an inline frame. Skipped when nothing
# was written, so a stale or undefined file name is never used.
if last_written is None:
    print("No posts found; nothing to display.")
else:
    print()
    print("Displaying {0}:".format(last_written))
    display(IFrame('files/{0}'.format(last_written), '100%', '600px'))
import nltk
import json
BLOG_DATA = "Data/feed.json"

# The context manager closes the file handle that open(...).read() leaked.
with open(BLOG_DATA) as blog_file:
    blog_data = json.load(blog_file)

for post in blog_data:
    sentences = nltk.tokenize.sent_tokenize(post['content'])
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]

    # A chunk is a run of adjacent tokens that share the same NN* tag.
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    current_entity_pos = None  # tag of the tokens in current_entity_chunk

    for (token, pos) in pos_tagged_tokens:
        if pos.startswith('NN'):
            if pos == previous_pos:
                current_entity_chunk.append(token)
            else:
                if current_entity_chunk:
                    # Note that current_entity_chunk could be a duplicate when
                    # appended, so frequency analysis again becomes a
                    # consideration. The chunk is stored with its own tag,
                    # not the tag of the token that starts the next chunk.
                    all_entity_chunks.append(
                        (' '.join(current_entity_chunk), current_entity_pos))
                current_entity_chunk = [token]
                current_entity_pos = pos
        previous_pos = pos

    # Flush the chunk still being built when the tokens run out; otherwise
    # the last entity of the document is silently dropped.
    if current_entity_chunk:
        all_entity_chunks.append(
            (' '.join(current_entity_chunk), current_entity_pos))

    # Store the chunks as an index for the document
    # and account for frequency while we're at it...
    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1

    # For example, we could display just the title-cased entities
    print(post['title'])
    print('-' * len(post['title']))
    for (entity, pos), count in post['entities'].items():
        if entity.istitle():
            print('\t{0} ({1})'.format(entity, count))
    print()
import nltk
import json
BLOG_DATA = "Data/feed.json"
def _noun_chunks(tagged_tokens):
    """Group adjacent tokens that share the same NN* tag into (text, tag) chunks."""
    chunks = []
    current_chunk = []
    current_pos = None
    previous_pos = None

    for (token, pos) in tagged_tokens:
        if pos.startswith('NN'):
            if pos == previous_pos:
                current_chunk.append(token)
            else:
                if current_chunk:
                    chunks.append((' '.join(current_chunk), current_pos))
                current_chunk = [token]
                current_pos = pos
        previous_pos = pos

    # Flush the chunk that is still open when the tokens run out.
    if current_chunk:
        chunks.append((' '.join(current_chunk), current_pos))

    return chunks


def extract_interactions(txt):
    """Find, per sentence, the noun chunks that occur together.

    The text is split into sentences, word-tokenized and POS-tagged with
    NLTK. Within each sentence, adjacent tokens sharing the same ``NN*`` tag
    form one entity chunk. Each chunk is reported with its own tag, and the
    last chunk of a sentence is included.

    Args:
        txt: The text to analyze.

    Returns:
        A dict with two parallel lists:
        ``sentences`` - the sentence strings;
        ``entity_interactions`` - for each sentence, its list of
        ``(chunk_text, pos_tag)`` tuples, or ``[]`` when the sentence has
        fewer than two chunks.
    """
    sentences = nltk.tokenize.sent_tokenize(txt)
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    entity_interactions = []
    for sentence in pos_tagged_tokens:
        all_entity_chunks = _noun_chunks(sentence)
        # A single entity has nothing to interact with, so such sentences
        # get an empty placeholder that keeps the two lists aligned.
        if len(all_entity_chunks) > 1:
            entity_interactions.append(all_entity_chunks)
        else:
            entity_interactions.append([])

    assert len(entity_interactions) == len(sentences)

    return dict(entity_interactions=entity_interactions,
                sentences=sentences)
# The context manager closes the file handle that open(...).read() leaked.
with open(BLOG_DATA) as blog_file:
    blog_data = json.load(blog_file)

# Display selected interactions on a per-sentence basis
for post in blog_data:
    post.update(extract_interactions(post['content']))

    print(post['title'])
    print('-' * len(post['title']))
    # One output line per sentence; sentences without interactions print as
    # an empty line.
    for interactions in post['entity_interactions']:
        print('; '.join([i[0] for i in interactions]))
    print()
import os
import json
import nltk
from IPython.display import IFrame
from IPython.core.display import display
from html import escape

BLOG_DATA = "Data/feed.json"

HTML_TEMPLATE = """<html>
<head>
<title>{0}</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>{1}</body>
</html>"""

# Create the output folder here too, so this cell does not depend on any
# other cell having created it.
os.makedirs('Data/Webpages', exist_ok=True)

# The context manager closes the file handle that open(...).read() leaked.
with open(BLOG_DATA) as blog_file:
    blog_data = json.load(blog_file)

# Path of the most recently written page; stays None when there are no posts.
last_written = None

for post in blog_data:
    post.update(extract_interactions(post['content']))

    # Display output as markup with entities presented in bold text
    post['markup'] = []
    for s, interactions in zip(post['sentences'], post['entity_interactions']):
        # The sentence comes from an external feed. Escape it so literal
        # <, > and & are displayed rather than interpreted as markup.
        s = escape(s, quote=False)
        for (term, _) in interactions:
            # Escape the term the same way so it still matches.
            safe_term = escape(term, quote=False)
            s = s.replace(safe_term, '<strong>{0}</strong>'.format(safe_term))
        post['markup'].append(s)

    # NOTE(review): only "?" is stripped from the title; a title that
    # contains a path separator would still produce an unusable path.
    filename = post['title'].replace("?", "") + '.entity_interactions.html'
    # Forward slashes are kept on purpose: the path is reused below as part
    # of a URL.
    path = 'Data/Webpages/' + filename

    page = HTML_TEMPLATE.format(
        escape(post['title'] + ' Interactions', quote=False),
        ' '.join(post['markup']))

    # The context manager closes the file even if the write fails.
    with open(path, 'wb') as f:
        f.write(page.encode('utf-8'))

    last_written = f.name
    print('Data written to', f.name)

# Display the last file processed with an inline frame. Skipped when nothing
# was written, so a stale or undefined file name is never used.
if last_written is None:
    print('No posts found; nothing to display.')
else:
    print('Displaying {0}:'.format(last_written))
    display(IFrame('files/{0}'.format(last_written), '100%', '600px'))