#!/usr/bin/env python
# coding: utf-8

# # Portmantout Words
#
# A *portmanteau* is a word that squishes together two overlapping words, like *math* + *athlete* = *mathlete*. You can make up your own, like *eskimo* + *kimono* = *eskimono*. Inspired by Darius Bacon, I covered this as a programming exercise in my 2012 [Udacity course](https://www.udacity.com/course/design-of-computer-programs--cs212). Recently I was re-inspired by [Tom Murphy VII](http://www.cs.cmu.edu/~tom7), who added a new twist: **[portmantout words](http://www.cs.cmu.edu/~tom7/portmantout/)**, which are defined as:
#
# > Given a set of words, W, a portmantout of W is a string, S, such that:
# * Each word in W is a substring of S.
# * S is formed by joining together an ordered list, L, of words from W (possibly with repeats).
# * For each word in L, some ending letter(s) must match the starting letters of the next word.
# * When L is joined into S, those overlapping letters appear only once.
# * The shorter the string S, the better.
#
# To make sure we understand the definition, I'll define the function `S = portman(L)`. To find the overlap between two words, `portman` considers each suffix of the previous word and takes the longest suffix that starts the next word; those overlapping letters are dropped from the next word. (I'll also define functions to list the prefixes and suffixes of a word.)

# In[1]:

def portman(words):
    "Join a sequence of words, eliminating from each word the overlap with the previous word."
    prev = words[0]
    result = [prev]
    for word in words[1:]:
        overlap = next(filter(word.startswith, suffixes(prev)))
        result.append(word[len(overlap):])
        prev = word
    return ''.join(result)

def prefixes(word) -> list:
    "All non-empty prefixes of word, shortest first."
    return [word[:i+1] for i in range(len(word))]

def suffixes(word) -> list:
    "All non-empty suffixes of word, longest first."
    return [word[i:] for i in range(len(word))]

# In[2]:

# An example:
W = {'on', 'one', 'two', 'won'}
S = 'twone'
L = ['two', 'won', 'on', 'one']

assert portman(L) == S
assert all(w in S for w in W)
assert set(L) == W

assert portman(['eskimo', 'kimono', 'monolith']) == 'eskimonolith'
assert prefixes('word') == ['w', 'wo', 'wor', 'word']
assert suffixes('word') == ['word', 'ord', 'rd', 'd']

# Note that `portman(['one', 'two'])` would raise an error, because there is no overlap from `'one'` to `'two'`.

# # Problem-Solving Strategy
#
# I watched part of Tom Murphy's entertaining [video](https://www.youtube.com/watch?time_continue=1&v=QVn2PZGZxaI), but then I stopped because I wanted to solve the problem at least partially on my own. Here's my strategy:
#
# 1. To start with, I'll need a list of words. Murphy provides [108,709 words](wordlist.asc), which I'll read into a `Dictionary` data structure.
# 2. This is clearly an NP-hard problem, so I won't try to find the shortest possible S; a reasonably short string found by a greedy algorithm will have to suffice.
# 3. In my [traveling salesperson notebook](TSP.ipynb) I show a greedy nearest-neighbor solution: start at one city and continually go to the nearest unvisited city until all cities are visited. We can do a similar thing here: start at one word, and repeatedly go to the previously-unused word that maximizes overlap, until all words are used.
# 4. However, this strategy has a problem: at some point it is likely that *none* of the remaining unused words overlap the current word. In that case, we'll need to re-use a word; I'll try to choose one that is short and that *bridges* to one of the remaining unused words.
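# Before diving in, here is a small illustration (my own addition, not part of the solution that follows) of the overlap computation that the greedy step in point 3 relies on, reusing the `suffixes` function defined above; the helper name `overlap` is mine:

# In[ ]:

def overlap(prev, word):
    "Length of the longest suffix of prev that is also a prefix of word (0 if none)."
    return next((len(suf) for suf in suffixes(prev) if word.startswith(suf)), 0)

assert overlap('two', 'won') == 2  # 'two' and 'won' share 'wo'
assert overlap('one', 'two') == 0  # no overlap, so portman would raise an error here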
# # Loading the Dictionary
#
# My `Dictionary` data structure contains three things:
# - A set of words.
# - A set of what I call *unique words*: the words that are not contained within other words. For example, "on" is contained within "one", so "on" would be in the set of words, but not in the set of unique words.
# - Above I said I wanted to find the "previously-unused word that maximizes overlap." To facilitate that, I'll store a mapping from an overlap to a list of the words that have that overlap as a prefix, for example `{'o': ['on', 'one', ...], ...}`.

# In[3]:

from collections import defaultdict, Counter
from functools import lru_cache
from typing import Optional

class Dictionary:
    "A collection of words, with prefixes and unique words precomputed."
    def __init__(self, stream):
        self.words = {w.strip().lower() for w in stream}
        self.uniq  = self.words - subwords(self.words)
        self.pre   = multimap((p, w) for w in self.words for p in prefixes(w))

def subwords(words) -> set:
    "All words that are hiding inside any of these words."
    return {sub for word in words for i in range(len(word))
            for sub in prefixes(word[i:])
            if sub != word and sub in words}

def multimap(pairs) -> dict:
    "Given (key, val) pairs, make a dict of {key: [val,...]}."
    result = defaultdict(list)
    for key, val in pairs:
        result[key].append(val)
    return result

# In[4]:

# A small example:
W = {'on', 'one', 'two', 'won'}
d = Dictionary(W)
d.words, d.uniq, dict(d.pre)

# That looks right. Now I can load the big dictionary, call it `D`, and explore it a bit:

# In[5]:

D = Dictionary(open('wordlist.asc'))

# In[6]:

len(D.words), len(D.uniq), len(D.pre), D.pre['somew']

# # Finding Sequences of Overlapping Words
#
# The function `natalie` will take a `Dictionary` as input and produce a list of overlapping words, e.g.
#
#     d = Dictionary({'on', 'one', 'two', 'won'})
#     natalie(d) ⇒ ['two', 'won', 'on', 'one']
#
# and we will solve the whole problem as follows:
#
#     portman(natalie(d)) ⇒ 'twone'
#
# Within `natalie`, we repeatedly add words to a `result` list until we have used up all the unique `words` from the dictionary. On each iteration we either add a `new_word` (thus decreasing the size of the remaining `words`), or we re-use a `repeated_word`, choosing one that will bridge to a word that we have not used yet.

# In[7]:

def natalie(d: Dictionary) -> list:
    "Return a list of words that cover the dictionary."
    words  = set(d.uniq)    # all the words we need to cover
    result = [words.pop()]  # a list of overlapping words
    firsts = Counter(word[0] for word in words)  # count of first letters of unused words
    while words:
        prev = result[-1]
        word = (new_word(d, prev, words) or repeated_word(d, prev, firsts))
        result.append(word)
        if word in words:
            words.remove(word)
            B = word[0]
            firsts[B] -= 1
            if not firsts[B]:
                del firsts[B]
    return result

# # Selecting the Next Word
#
# Selecting a `new_word` is easy: consider the suffixes of the previous word, longest suffix first, and if that suffix is a prefix of an unused word, then take it.

# In[8]:

def new_word(d: Dictionary, prev: str, words: set) -> Optional[str]:
    "Find an overlapping word to follow the previous word (or None)."
    for suf in suffixes(prev):
        if suf in d.pre:
            for word in d.pre[suf]:
                if word in words:
                    return word
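# As a quick sanity check (my own addition, reusing the small dictionary `d` from above): after `'two'`, the greedy step finds `'won'` via the overlap `'wo'`; after `'won'`, no suffix begins a remaining unused word, so `new_word` returns `None`.

# In[ ]:

assert new_word(d, 'two', {'won', 'one'}) == 'won'
assert new_word(d, 'won', {'two'}) is None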
# Now suppose we reach a situation where the previous word was `'one'`, and the only remaining unused word is `'two'`. Since there is no overlap, `new_word` will fail, but we can find the shortest previously-used word that will `bridge` from the `'e'` at the end of `'one'` to the `'t'` at the start of `'two'`:

# In[9]:

@lru_cache(1000)
def bridge(d, A, B) -> str:
    "Find a word that bridges A to B: it starts with A and ends with B."
    return shortest(word for word in d.pre[A] if word.endswith(B))

def shortest(items): return min(items, key=len, default=None)

# In[10]:

bridge(D, 'e', 't')

# But in general, there will be several unused words that we might bridge to; we will greedily choose the shortest bridge to any of the unused words. That sounds expensive when there are thousands of unused words, so we summarize them with a counter, `firsts`, that gives for each letter the number of unused words that start with that letter (so, if the unused words are `{'two', 'three', 'four'}`, then `firsts` will be `{'t': 2, 'f': 1}`).
#
# Furthermore, it might be that *no* word can bridge directly to an unused word. In that case we can take two steps, first bridging from `A` to an intermediate letter `L`, and then bridging from `L` to a letter `B` that starts some unused word. The function `repeated_word` handles all these cases:

# In[11]:

def repeated_word(d: Dictionary, prev: str, firsts: Counter) -> str:
    "Find a previously-used word that will bridge to one of the letters in firsts."
    A = prev[-1]
    word = shortest(bridge(d, A, B) for B in firsts if bridge(d, A, B))
    if word:
        return word
    else:
        candidates = [[bridge(d, A, L), bridge(d, L, B)]
                      for L in alphabet for B in firsts
                      if A != L != B and bridge(d, A, L) and bridge(d, L, B)]
        return min(candidates, key=lambda seq: sum(map(len, seq)))[0]

alphabet = 'abcdefghijklmnopqrstuvwxyz'

# For example, suppose the previous word is `'one'`, and the only remaining unused word is `'quit'`. There is no word that bridges from `'e'` to `'q'`, so we will have to get there in two steps. Here's the first step:

# In[12]:

repeated_word(D, 'one', {'q': 1})

# And here's the second:

# In[13]:

repeated_word(D, 'elhi', {'q': 1})

# # A Solution
#
# We're ready to solve the problem!

# In[14]:

get_ipython().run_cell_magic('time', '', 'L = natalie(D)\nS = portman(L)\n')

# In[15]:

len(L), len(D.uniq), len(S)

# Here we see the start of the list:

# In[16]:

L[:20]

# And the start of the string:

# In[17]:

S[:2000]

# If you want to see the whole string, I'll write it to the file [natalie.txt](natalie.txt).

# In[18]:

open('natalie.txt', 'w').write(S)

# # Next Steps
#
# Each time I run this, I get a slightly different result (there are no `random` calls, but each Python re-start gets a random hash seed, which results in a different iteration order over dicts and sets). I had originally planned to add some randomness and take the best of *k* trials (as I did in my [TSP](TSP.ipynb) notebook), but all the trials I have seen so far fall within a narrow range of 575,000 to 581,000 letters, so I think it is not worth the effort for what would probably be a 1% improvement at best. My string is about 6% shorter than Murphy's 611,820-letter solution.
#
# I'll stop here (but you should feel free to do some experimentation of your own). Some ideas:
#
# - One weakness of my approach is that it can get stuck. With `W = {'one', 'two'}`, if my algorithm chooses `'one'` as the starting word, it will fail. You could fix that by allowing new words to be added to the start of the list (hint: use a `deque`) as well as the end; a minimal sketch of this idea appears just after this list.
# - Following my [TSP](TSP.ipynb) notebook, an alternative to maintaining a single list and adding the maximum-overlap word is to maintain multiple lists, and on each iteration merge the two lists with maximum overlap.
# - With the 108,709-word dictionary it is always possible to bridge from any letter to any other letter in at most two steps, but for smaller dictionaries that might not be the case. You could handle that case.
# - To minimize the length of S, we can do three things: get better overlap between the unique words, use fewer/shorter bridging words, or get better overlap of the bridging words. Can you think of strategies to achieve any of these?
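# Here is a minimal sketch (my own, not a complete fix) of the first idea above: keep the result in a `collections.deque`, so that when no unused word extends the right end, we can try prepending an unused word at the left end instead, falling back to bridging (as `natalie` does) only when both ends are stuck. The function name `grow_both_ends` is hypothetical.

# In[ ]:

from collections import deque

def grow_both_ends(d: Dictionary, start: str) -> deque:
    "Sketch: greedily add unused words at either end of a deque of overlapping words."
    words = set(d.uniq) - {start}
    result = deque([start])
    while words:
        word = new_word(d, result[-1], words)  # try extending the right end
        if word:
            result.append(word)
        else:                                  # else try prepending at the left end
            word = next((w for w in words if new_word(d, w, {result[0]})), None)
            if not word:
                break  # both ends stuck; a full solution would bridge with repeated_word
            result.appendleft(word)
        words.remove(word)
    return result

# On the problematic example from the first bullet, starting at 'one' now succeeds:
assert portman(list(grow_both_ends(Dictionary({'one', 'two'}), 'one'))) == 'twone'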
# ## Unit Tests
#
# This gives some examples of how the functions are used, and some assurance that they are doing the right thing.

# In[19]:

def test(D):
    W = {'on', 'one', 'two', 'won'}
    S = 'twone'
    L = ['two', 'won', 'on', 'one']
    assert portman(L) == S
    assert all(w in S for w in W)
    assert set(L) == W
    assert prefixes('word') == ['w', 'wo', 'wor', 'word']
    assert suffixes('word') == ['word', 'ord', 'rd', 'd']
    assert subwords({'hello', 'world', 'he', 'hell', 'el', 'or'}) == (
        {'el', 'he', 'hell', 'or'})
    assert multimap([(1, 10), (2, 20), (3, 30), (2, 22), (3, 33)]) == (
        {1: [10], 2: [20, 22], 3: [30, 33]})
    assert 'portmanteau' in D.words
    assert 'port' in D.words
    assert 'port' not in D.uniq
    assert set(D.pre['hello']) == {'hello', 'helloed', 'helloes', 'helloing', 'hellos'}
    assert bridge(D, 'a', 'z') == 'abuzz'
    assert bridge(D, 'a', 't') == 'at'
    assert bridge(D, 'e', 't') == 'eat'
    assert bridge(D, 'f', 'd') == 'fad'
    assert portman(['two', 'won', 'on', 'one']) == 'twone'
    assert portman(['eskimo', 'kimono', 'monolith']) == 'eskimonolith'
    return 'pass'

test(D)
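# One more end-to-end check (my own addition): by the definition at the top of this notebook, every word in W must be a substring of S. It suffices to check the unique words, since every other word is contained inside one of them. (This scan takes a little while on the full dictionary.)

# In[ ]:

assert all(word in S for word in D.uniq)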