Ternary Search Tries

June 5, 2009

We represent a ternary search trie as a collection of nodes, each a six-slot vector: a boolean indicator whether or not the node has a value, the value itself, the split character that splits the trie into less-than, equal-to, and greater-than parts, and three child nodes:

(define (node v? v s l e h) (vector v? v s l e h))
(define nil (vector #f #f (integer->char 0) (vector) (vector) (vector)))
(define (nil? tst) (eqv? tst nil))
(define (val? tst) (vector-ref tst 0))
(define (val tst) (vector-ref tst 1))
(define (split tst) (vector-ref tst 2))
(define (lokid tst) (vector-ref tst 3))
(define (eqkid tst) (vector-ref tst 4))
(define (hikid tst) (vector-ref tst 5))

We begin with lookup, which searches for an item in a ternary search trie. Lookup is recursive, with two base cases — a nil trie (failure) and a null key (success, if the current node contains a value) — and three recursive cases, one each for less-than, greater-than, and equal-to:

(define (lookup t k)
  (cond ((nil? t) #f)
        ((null? k) (if (val? t) (val t) #f))
        ((char<? (car k) (split t)) (lookup (lokid t) k))
        ((char<? (split t) (car k)) (lookup (hikid t) k))
        (else (lookup (eqkid t) (cdr k)))))

Insert builds a new trie, so it is fully applicative; the cases are the same as for lookup. The nil case is interesting; insert builds a new node, then calls itself recursively without changing any of its arguments, so that at the next call the trie will have the node it needs to continue:

(define (insert t k v)
  (cond ((nil? t) (insert (node #f #f (if (null? k) (integer->char 0) (car k)) nil nil nil) k v))
        ((null? k) (node #t v (split t) (lokid t) (eqkid t) (hikid t)))
        ((char<? (car k) (split t)) (node (val? t) (val t) (split t) (insert (lokid t) k v) (eqkid t) (hikid t)))
        ((char<? (split t) (car k)) (node (val? t) (val t) (split t) (lokid t) (eqkid t) (insert (hikid t) k v)))
        (else (node (val? t) (val t) (split t) (lokid t) (insert (eqkid t) (cdr k) v) (hikid t)))))

Update is a simple variant on insert:

(define (update t k p v)
  (cond ((nil? t) (update (node #f #f (if (null? k) (integer->char 0) (car k)) nil nil nil) k p v))
        ((null? k) (if (val? t) (node #t (p k (val t)) (split t) (lokid t) (eqkid t) (hikid t))
                                (node #t v (split t) (lokid t) (eqkid t) (hikid t))))
        ((char<? (car k) (split t)) (node (val? t) (val t) (split t) (update (lokid t) k p v) (eqkid t) (hikid t)))
        ((char<? (split t) (car k)) (node (val? t) (val t) (split t) (lokid t) (eqkid t) (update (hikid t) k p v)))
        (else (node (val? t) (val t) (split t) (lokid t) (update (eqkid t) (cdr k) p v) (hikid t)))))

Delete is interesting. If the key isn’t already in the trie, the nil case will eventually stop the recursion without changing the trie. If the key is in the trie, the null case sets the value indicator to #f. This means that an insert followed immediately by a delete of the just-inserted key will potentially leave the trie in a different state than when it started, as the “ghost” structure of the deleted nodes remains. It is possible, though at considerable cost, to remove the ghost nodes; we don’t bother:

(define (delete t k)
  (cond ((nil? t) t)
        ((null? k) (node #f #f (split t) (lokid t) (eqkid t) (hikid t)))
        ((char<? (car k) (split t)) (node (val? t) (val t) (split t) (delete (lokid t) k) (eqkid t) (hikid t)))
        ((char<? (split t) (car k)) (node (val? t) (val t) (split t) (lokid t) (eqkid t) (delete (hikid t) k)))
        (else (node (val? t) (val t) (split t) (lokid t) (delete (eqkid t) (cdr k)) (hikid t)))))

Enlist traverses the trie in order:

(define (enlist t)
  (let enlist ((t t) (k '()))
    (if (nil? t) '()
      (append (enlist (lokid t) k)
              (if (val? t)
                  (cons (cons (list->string (reverse k)) (val t))
                        (enlist (eqkid t) (cons (split t) k)))
                  (enlist (eqkid t) (cons (split t) k)))
              (enlist (hikid t) k)))))

An example of the use of ternary search tries is given by the earlier word-frequencies exercise. Here is a restatement of the central function of that exercise:

(define (word-freq n file-name)
  (define (freq-gt? a b) (> (cdr a) (cdr b)))
  (with-input-from-file file-name
    (lambda ()
      (let loop ((word (read-word)) (freqs nil))
        (if (eof-object? word)
            (take n (sort freq-gt? (enlist freqs)))
            (loop (read-word) (update freqs (string->list word) (lambda (k v) (+ v 1)) 1)))))))

Take is from the Standard Prelude, read-word is from the earlier exercise. When called with all its declarations, word-freq produces the same output as the earlier exercise:

> (word-freq 25 "bible.txt")
(("the" . 62588) ("and" . 30875) ("of" . 30183)
 ("to" . 23023) ("you" . 14887) ("in" . 13357) ("he" . 10495)
 ("a" . 10150) ("i" . 9078) ("for" . 8983) ("his" . 8424)
 ("lord" . 8129) ("your" . 7398) ("with" . 7259)
 ("that" . 7187) ("is" . 7143) ("they" . 7005) ("not" . 6484)
 ("him" . 6140) ("will" . 6093) ("them" . 5831) ("be" . 5668)
 ("who" . 5611) ("from" . 5476) ("it" . 5395))

You can see all the code at http://programmingpraxis.codepad.org/UnTljjnx.

Pages: 1 2

4 Responses to “Ternary Search Tries”

  1. […] Praxis – Ternary Search Tries By Remco Niemeijer Today’s Programming Praxis problem is about Ternary search tries, which are basically hashmaps of strings […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2009/06/05/programming-praxis-ternary-search-tries/ for a version with comments):

    import Data.Char
    import qualified Data.List.Key as K
    import Prelude hiding (lookup)

    data TernaryTrie k v = Empty | Node { val :: Maybe v,
    split :: [k], lb :: !(TernaryTrie k v),
    eb :: !(TernaryTrie k v), gb :: !(TernaryTrie k v) }

    lookup :: Ord k => [k] -> TernaryTrie k v -> Maybe v
    lookup _ Empty = Nothing
    lookup [] t = val t
    lookup (x:xs) t = case compare [x] $ split t of
    GT -> lookup (x:xs) $ gb t
    LT -> lookup (x:xs) $ lb t
    EQ -> lookup xs $ eb t

    insert :: Ord k => [k] -> v -> TernaryTrie k v -> TernaryTrie k v
    insert k v Empty = insert k v $
    Node Nothing (take 1 k) Empty Empty Empty
    insert [] v t = t { val = Just v }
    insert k v t = modify (flip insert v) k t

    update :: Ord k => [k] -> v -> (v -> v) ->
    TernaryTrie k v -> TernaryTrie k v
    update k v _ Empty = insert k v Empty
    update [] v p t = t { val = Just . maybe v p $ val t }
    update k v p t = modify (\x -> update x v p) k t

    delete :: Ord k => [k] -> TernaryTrie k v -> TernaryTrie k v
    delete _ Empty = Empty
    delete [] t = t { val = Nothing }
    delete k t = modify delete k t

    modify :: Ord k => ([k] -> TernaryTrie k v -> TernaryTrie k v) ->
    [k] -> TernaryTrie k v -> TernaryTrie k v
    modify f k t = case compare (take 1 k) (split t) of
    LT -> t { lb = f (drop 0 k) $ lb t }
    EQ -> t { eb = f (drop 1 k) $ eb t }
    GT -> t { gb = f (drop 0 k) $ gb t }

    enlist :: TernaryTrie k v -> [([k], v)]
    enlist = enlist’ [] where
    enlist’ _ Empty = []
    enlist’ k t =
    maybe [] (\v -> [(k, v)]) (val t) ++ enlist’ k (lb t) ++
    enlist’ (k ++ split t) (eb t) ++ enlist’ k (gb t)

    main :: IO ()
    main = print . take 25 . reverse . K.sort snd . enlist .
    foldl (\t k -> update k 1 succ t) Empty .
    map (map toLower . filter isAlpha) . words =<< readFile "bible.txt" [/sourcecode]

  3. Vikas Tandi said

    implemented in C

    #include <stdlib.h>
    
    typedef struct TernaryTrieNode
    {
    	char c;
    	struct TernaryTrieNode *left;
    	struct TernaryTrieNode *mid;
    	struct TernaryTrieNode *right;
    }TernaryTrie;
    
    static TernaryTrie* create_node(char c);
    static int TernaryTrie_search_imp(TernaryTrie *p, char *s, int pos);
    static TernaryTrie* TernaryTrie_insert_imp(TernaryTrie *p, char *s, int pos);
    
    TernaryTrie* TernaryTrie_init()
    {
    	TernaryTrie *head;
    
    	head = create_node(0);
    	if(head == NULL)
    		return NULL;
    
    	head->left = head->right = head;
    	return head;
    }
    
    int TernaryTrie_search(TernaryTrie *p, char *s)
    {
    	if(s == NULL || s[0] == '\0')
    		return 0;
    	return TernaryTrie_search_imp(p->mid, s, 0);
    }
    
    static int TernaryTrie_search_imp(TernaryTrie *p, char *s, int pos)
    {
    	if(p == NULL)
    		return 0;
    	if(s[pos] == '\0')
    		return 1;
    	if(s[pos] < p->c)
    		return TernaryTrie_search_imp(p->left, s, pos);
    	if(s[pos] == p->c)
    		return TernaryTrie_search_imp(p->mid, s, pos+1);
    	if(s[pos] > p->c)
    		return TernaryTrie_search_imp(p->right, s, pos);
    }
    
    TernaryTrie* TernaryTrie_insert(TernaryTrie *p, char *s)
    {
    	if(s == NULL || s[0] == '\0')
    		return p;
    	p->mid = TernaryTrie_insert_imp(p->mid, s, 0);
    	return p;
    }
    
    static TernaryTrie* TernaryTrie_insert_imp(TernaryTrie *p, char *s, int pos)
    {
    	if(p == NULL)
    	{
    		p = create_node(s[pos]);
    		if(p == NULL)
    			return NULL;
    	}
    	if(s[pos] == '\0')
    		return p;
    
    	if(s[pos] < p->c)
    		p->left = TernaryTrie_insert_imp(p->left, s, pos);
    	if(s[pos] == p->c)
    		p->mid = TernaryTrie_insert_imp(p->mid, s, pos+1);
    	if(s[pos] > p->c)
    		p->right = TernaryTrie_insert_imp(p->right, s, pos);
    
    	return p;
    }
    
    static TernaryTrie* create_node(char c)
    {
    	TernaryTrie *p;
    
    	p = (TernaryTrie*)malloc(sizeof(*p));
    	if(p == NULL)
    		return NULL;
    
    	p->c = c;
    	p->left = p->right = p->mid = NULL;
    	return p;
    }
    
  4. John Cowan said

    You can save a third of the storage in Scheme (or any dynamically typed language) by imposing a trivial constraint: the value can’t be a character. By using a unique object to represent “no value” (I personally like to use (string-copy “something meaningful”), which is guaranteed to generate a new string object)), then you can use one slot for the Boolean, the value, and the split character.

Leave a comment