Optimal Alphabetical Order

May 14, 2013

Brian Kell, who wrote the page we are looking at, gives a list of 67,230 words that we read into a list of lists of characters in the variable words:

(define (get-words filename)
  (with-input-from-file filename
    (lambda ()
      (let loop ((word (read-line)) (words (list)))
        (if (eof-object? word) (reverse words)
          (let ((word (map char-upcase (string->list word))))
            (loop (read-line) (cons word words))))))))

(define words (get-words "words.txt"))

The key defines the alphabetical ordering of the letters in a word. Here we define the base key, which is the standard alphabet, and write a function that displays a key; the key is stored as a vector of positions, indexed by the alphabet with A = 1 and Z = 26:

(define alpha (vector 1 2 3 4 5 6 7 8 9 10 11 12
  13 14 15 16 17 18 19 20 21 22 23 24 25 26))

(define (display-key key)
  (let loop ((ks (vector->list key)) (xs (list)))
    (if (null? ks)
        (list->string (reverse xs))
        (loop (cdr ks) (cons (integer->char (+ (car ks) 64)) xs)))))

Function alpha? determines if a word is in alphabetical order according to the current key:

(define (alpha? word key)
  (let loop ((cs word) (prev 0))
    (if (null? cs) #t
      (let ((curr (vector-ref key (- (char->integer (car cs)) 65))))
        (if (< curr prev) #f (loop (cdr cs) curr))))))

Then we can count the number of words in the word list that are in alphabetical order according to the given key:

(define (count words key)
  (length (filter (lambda (w) (alpha? w key)) words)))

To find the key that produces the maximum number of words in alphabetical order, we will use a kind of hill-climbing technique. Each time through the main loop we compute the number of words that are in alphabetical order according to the current key. If the new key improves on the current key, we display the new key and keep the new maximum in prev. Then we compute a new key and loop. Most of the time the new key is computed by swapping two letters in the current key, using a function alter. Every 2000 times through the loop we call alter an extra time, and every 50000 times through the loop we call alter three extra times; the idea is to move more quickly to another local maximum. Every million times through the loop we shuffle the key, potentially moving every letter in the key, so that we can restart the search for a new local maximum:

(define (climb words key)
  (let ((prev (count words key)))
    (display (display-key key))
    (display " ") (display prev) (newline)
    (let loop ((key key) (prev prev))
      (let* ((new-key (alter key))
             (score (count words new-key)))
        (cond ((< prev score)
                (display (display-key key))
                (display " ") (display score) (newline)
                (loop new-key score))
              ((zero? (randint 1000000))
                (loop (shuffle key) prev))
              ((zero? (randint 50000))
                (loop (alter (alter (alter new-key))) prev))
              ((zero? (randint 2000))
                (loop (alter new-key) prev))
              (else (loop key prev)))))))

Here are the alter and shuffle functions.

(define (alter key)
  (let ((p (randint 0 26)) (q (randint 0 26)))
    (let ((t (vector-ref key p)))
      (vector-set! key p (vector-ref key q))
      (vector-set! key q t)))
  key)

(define (shuffle vec)
  (do ((n (vector-length vec) (- n 1)))
      ((zero? n) vec)
    (let* ((r (randint n)) (t (vector-ref vec r)))
      (vector-set! vec r (vector-ref vec (- n 1)))
      (vector-set! vec (- n 1) t))))

To run the program, say (climb words (shuffle alpha))

The best key I found at the time of writing is KADVQXLHNJCRIOGTBEYPMUFSZW, which gives 2328 words in alphabetical order, but I left the program running on my machine at work and hope to have a better key in the morning; on his web page, Kell gives a key with 4046 words in alphabetical order.

We used randint from the Standard Prelude. You can run the program at http://programmingpraxis.codepad.org/DLzOQf2A, which provides a small dictionary for demonstration and testing. The best key I found for that small dictionary is ELBVYMOCJDRGXKHAWUNTPFIQZS, which gives 16 words in alphabetical order.

Pages: 1 2

10 Responses to “Optimal Alphabetical Order”

  1. Paul said

    This Python version gives a score of 4038 words in right order after about 3.5 minutes.

    import string
    import random
    
    words = [w[:-1] for w in open("words.txt").readlines()]
    
    key = dict(zip(string.ascii_uppercase, range(26)))
    
    def optimal_word(word, key):
        last = -1
        for c in word:
            if key[c] >= last:
                last = key[c]
            else:
                return False
        return True
        
    def score(key):
        return sum(1 for w in words if optimal_word(w, key))
        
    def random_char():
        return random.choice(string.ascii_uppercase)
        
    def try_key2(key, sc):
        """ swap 2 chars"""
        newkey = dict(key)
        i, j = random_char(), random_char()
        if i != j:
            newkey[i], newkey[j] = key[j], key[i]
            nsc = score(newkey)
            if nsc > sc:
                return newkey, nsc
        return False
        
    def try_key3(key, sc):
        newkey = dict(key)
        i, j. k = random_char(), random_char(), random_char()
        if i != j and i != k and j != k:
            newkey[i], newkey[j], newkey[k] = key[j], key[k], key[i]
            nsc = score(newkey)
            if nsc > sc:
                return newkey, nsc
        return False
        
    def optimize(key):
        sc = score(key)
        print sc
        for i in range(1000000):
            if random.randint(0,1) == 0:
                res = try_key2(key, sc)
            else:
                res = try_key3(key, sc)
            if res:
                key, sc = res
                print sc
        return key
        
    print optimize(key)
    
  2. szemet said

    Sorry for not solving the actual quiz, but reading this problem I’ve become curious, what would be the result if we interpret the word list as an ordered character preference vote, and run a Concorcet (in this case Schulze) method on them. http://en.wikipedia.org/wiki/Schulze_method

    If my Haskell program is correct (I haven’t tested it… :() the result is:

    “EAIORSLNTUCDGPMHBYKFWVZJXQ”

    (in this particular case I’ve simply removed duplicate characters from all word, maybe other ways of handling duplicates is possible to)

    import Data.List as L
    import Data.Map as M 
    import Data.Ord
    
    -- generate all letter pairs from word (duplicates filtered), missing letters put at the end of the list with equal weight
    votes word = [x | x<-L.subsequences w, L.length x == 2] ++ [a:[b]|a <- w,b <- "ABCDEFGHIJKLMNOPQRSTUVWXYZ" L.\\ word]
    			where w = L.nub word
    
    -- make map of letterpair frequencies
    pairMap dict = M.fromListWith (+) [(p, 1) | p <- concatMap votes $ lines dict]
    
    -- update distances using a specific letter as the intermediate element
    schulzeUpdate :: Map [Char] Int -> Char -> Map [Char] Int
    schulzeUpdate pmap k = M.mapWithKey (\[a,b] n -> max n (min (get (a:[k])) (get (k:[b])))) pmap
    		where get k =M.findWithDefault 0 k pmap
    
    -- Floyd–Warshall for widest path
    shulzeDist pmap = L.foldl' schulzeUpdate pmap "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    		
    		
    -- custom compare using the map
    comp pmap a b = compare (get (b:[a])) (get (a:[b]))
    		where get k =M.findWithDefault 0 k pmap
    		   
    main = do
    			 dict <- readFile "words.txt"
    			 putStrLn $ sortBy (comp $ shulzeDist $ pairMap dict) "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 
    
  3. szemet said

    Cleared up the code a bit (using pairs instead of lists, correcting typos, etc…) here: http://hpaste.org/87967

  4. szemet said

    Still offtopic :$ but intresetnig this Schulze voting was much ado about nothing, the result:
    EAIORSLNTUCDGPMHBYKFWVZJXQ

    is nearly the identical to the one we get from character frequencies
    ESAIORLNTUDCGPMHBYFKWVZJXQ

    If we don’t follow the requirement about handling the missing characters (they get quite much down-vote when they are not there) the result is:
    JQVFWZBPHUMKCOALTIRNESGDYX

    This should represent the usual character order in words better – I guess.

  5. Paul said

    The process can be speeded up enormously using a Trie to store all the words. Then counting the words in the right order is about a factor 10 faster. I will post code later. I get now 4046 words in correct order with key=CWBFJHLOAQUMPVXINTKGZERDYS after 134 seconds. I do not get this answer with every run, as it it a random optimization.

  6. JP said

    Interesting. I actually worked out this problem about four months ago for one of the Hard challenges on the /r/DailyProgrammer subreddit: [01/25/13] Challenge #118 [Hard] Alphabetizing cipher (albeit with a smaller dictionary).

    Given the appropriate helper functions (see blog post below), here’s my basic Racket solution (it uses hill climbing and randomization on finding a local maximum):

    ; solve via direct hill climbing
    (define (solve/hill-climbing [guess (random-key)]
                                 [score -inf.0]
                                 [overall-guess guess]
                                 [overall-score score])
      
      ; try every possible single swap
      (define-values (new-guess new-score)
        (for*/fold ([guess guess]
                    [score score])
                   ([i (in-range 26)]
                    [j (in-range i 26)])
          (define new-guess (string-swap guess i j))
          (define new-score (score-key new-guess))
          (if (> new-score score)
              (values new-guess new-score)
              (values guess score))))
    
      ; update the overall best (will actually print next round)
      (define-values (new-overall-guess new-overall-score)
        (if (>= new-score overall-score)
            (values new-guess new-score)
            (values overall-guess overall-score)))
      
      ; print out local best values and best overall
      (cond
        [(equal? guess new-guess)
         (printf "local maximum, shuffling\n")
         (for ([i (in-range (+ (random 6) 4))])
           (string-swap! guess (random 26) (random 26)))
         (define new-score (score-key new-guess))
         (printf "~a (~a)  \toverall: ~a (~a)\n" new-guess new-score overall-guess overall-score) 
         (solve/hill-climbing new-guess new-score new-overall-guess new-overall-score)]
        [else
         (printf "~a (~a)  \toverall: ~a (~a)\n" new-guess new-score overall-guess overall-score) 
         (solve/hill-climbing new-guess new-score new-overall-guess new-overall-score)]))
    

    Blog post (with the above solution and a few more zany matrix based things that never ended up going anywhere): An optimal alphabetizing cipher

    Full source (on GitHub): alphabetizing-cipher.rkt on GitHub

    (While writing this post, I’ve found a few solutions just under 4000. I’ll let it run overnight and see what happens.)

  7. szemet said

    By the way I think that this problem is equivalent with the NP-hard maximum clique problem: if we make the graph by connecting two words if there exist an alphabet such that both words have all their letters in alphabetical order, then from the cliques (where each word is connected with each other) would be easy to generate an appropriate alphabet

  8. zelrok said

    my pythons: http://pastebin.com/XgxQmF7W
    best = “JQBPCOUFVWMHAXZTRLIKNGEDSY” # 3038 words

  9. JP said

    @szemet: Yes, a maximal clique would give an optimal answer. But the runtime is so ridiculously bad… If you check my link(s) above, I implemented the Bron-Kerbosch algorithm to try and solve it that way, but after about two months the largest clique it had found was only 46 words (on the smaller dictionary). This is definitely one of the classes of problems where a statistical approach will be better. It may be possible to speed up the problem a bit more with some domain specific knowledge, but I’m not sure how.

  10. Paul said

    Here is the Python code using a trie to represent te words. The fastest solution sofar was 26 seconds with a score of 4046 and key BPCHFLJOWAQUXMVINKGTZERDYS.

    import string
    import random
    import time
    import operator
    
    _end = '_end_'
    
    t0 = time.clock()
    
    fname = "D:/Data/Development/python/src/pywork/prpr/words.txt"
    
    words = [w[:-1] for w in open(fname).readlines()]
    
    def make_trie(*words):
        root = dict()
        for word in words:
            current_dict = root
            for letter in word:
                current_dict = current_dict.setdefault(letter, {})
            current_dict = current_dict.setdefault(_end, _end)
        return root
        
    root = make_trie(*words)
    
    def random_key():
        order = range(26)
        random.shuffle(order)
        return dict(zip(string.ascii_uppercase, order))
        
    def count_ordered(trie, key, last):
        """walk the trie and count the ordered words"""
        counts = 0
        if _end in trie:
            counts += 1
        for letter in trie:
            if letter != _end:
                klet = key[letter]
                if klet >= last:
                    counts += count_ordered(trie[letter], key, klet)
        return counts
                    
    def random_char():
        return random.choice(string.ascii_uppercase)
        
    def key_swap_n(key, n):
        """ rotate n items of the key
        """
        newkey = dict(key)
        ind = [1] * n
        while n != len(set(ind)):
            ind = [random_char() for i in range(n)]
        for i, j in zip(ind, ind[1:] + [ind[0]]):
            newkey[i] = key[j]
        return newkey
    
    def score(key):
        return count_ordered(root, key, -1)
    
    def str_key(key):
        """str of the key"""
        items = key.items()
        items.sort(key=operator.itemgetter(1))
        return "".join(s for s, i in items)
    
    SWAPS = dict(zip(range(100), 90 * [2] + 7 * [3] + 3 * [5]))    
    
    def optimize(time_limit):
        """ try 10 times to get best solution
            stop after time_limit (seconds) is reached
            In practise a solution does not change oftenfter 300 secs
        """
        for n_tries in range(10):
            t1 = time.clock()
            key = random_key()
            sc = score(key)
            best_sc = sc
            for i in range(1000000):
                newkey = key_swap_n(key, SWAPS[i % 100])
                nsc = score(newkey)
                if nsc > sc - 2:
                    if nsc > best_sc:
                        best_sc = nsc
                        best_key = newkey
                        best_time = time.clock() - t1
                    key, sc = newkey, nsc
                if time.clock() - t1 > time_limit:
                    print "score={0:5d} key={1:s} time={2:4.0f} sec  niter={3:d}".format(best_sc, str_key(best_key), best_time, i)
                    break
        return key
    
    print optimize(60)
    """
    Last run:
    score= 3985 key=PCBFJWMHOURQAXVINTGZLKEDYS time=  23 sec  niter=14147
    score= 4046 key=PFJBCHLOWAQUMXVINGTZKERDYS time=  32 sec  niter=14425
    score= 4039 key=CPBFWMHJOQURAXVINTZGLKEDYS time=  21 sec  niter=13981
    score= 3907 key=BCWJPMHOUFRAVINXGTZLKEDYSQ time=  20 sec  niter=14251
    score= 4039 key=BCWPFJMHOQURAVXINTZGLKEDYS time=  42 sec  niter=13772
    score= 3919 key=PFCJWBHOURMAXVINQTGZLKEDYS time=  13 sec  niter=14026
    score= 3995 key=BPFCJHLOWAUMXVZINGTKERDYSQ time=  15 sec  niter=14418
    score= 3978 key=PCWFJBMHOAUVXINGQTZLKERDYS time=  60 sec  niter=14883
    score= 3851 key=PFBCQWMHJOAUXRTZLVKINGEDYS time=  21 sec  niter=13906
    score= 3953 key=CPFBJWMHOAUVINGXTZQLKERDYS time=  51 sec  niter=15115
    
    """
    

Leave a comment