Natural Join

June 15, 2010

We will follow the solution given by Alfred V. Aho, Brian W. Kernighan and Peter J. Weinberger in their book The AWK Programming Language:

(define (join file1 file2)
  (let ((f1 (open-input-file file1)) (f2 (open-input-file file2)))
    (let loop ((line1 (read-line f1)) (group2 (get-group f2)))
      (cond ((or (eof-object? line1) (null? group2))
              (close-input-port f1) (close-input-port f2))
            ((string<? (prefix line1) (prefix (car group2)))
              (loop (read-line f1) group2))
            ((string<? (prefix (car group2)) (prefix line1))
              (loop line1 (get-group f2)))
            (else (do ((group2 group2 (cdr group2))) ((null? group2))
                    (display line1) (display #\tab)
                    (display (string-join #\tab (suffix (car group2))))
                    (newline))
                  (loop (read-line f1) group2))))))

The loop reads one line from the first file and one group of lines with a common key from the second file. The four conditions handle end-of-file, mis-matched keys in both directions, and matching keys. Get-group gets the next set of lines (which may be a singleton set) with a common key and returns them in a list, or the null list at end-of-file:

(define (get-group f)
  (let loop ((line (getone f)) (xs '()))
    (cond ((eof-object? line) (reverse xs))
          ((null? xs) (loop (getone f) (cons line xs)))
          ((string=? (prefix line) (prefix (car xs)))
            (loop (getone f) (cons line xs)))
          (else (unget line) (reverse xs)))))

Get-group can’t know that a line is not part of the current set until it reads the line, by which time it is too late; this is a common problem for parsers, which need to lookahead in the input to know what to do. The problem is solved by an unget function that “un-reads” an input line. Getone checks if there is an ungotten line already saved before it reads a new line, implementing a classic pushback mechanism:

(define ungot-line #f)

(define (getone f)
  (if ungot-line
      (let ((x ungot-line)) (set! ungot-line #f) x)
      (read-line f)))

(define (unget line) (set! ungot-line line))

Here are prefix and suffix, which extract the key and the remainder of the fields from an input line. They are localized here to make them easy to change, for instance if you want to join tables on the n‘th field:

(define (prefix line) (car (string-split #\tab line)))

(define (suffix line) (cdr (string-split #\tab line)))

Here’s the example from the previous page. The 0 at the end of the two system calls is the value that cat returns to the shell:

> (system "cat f1")
A       w       p
B       x       q
B       y       r
C       z       s
0
> (system "cat f2")
A       1
A       2
B       3
0
> (join "f1" "f2")
A       w       p       1
A       w       p       2
B       x       q       3
B       y       r       3

We used string-split, string-join and read-line from the Standard Prelude. You can run the program at http://programmingpraxis.codepad.org/oTmFY1uw.

Pages: 1 2

5 Responses to “Natural Join”

  1. […] Praxis – Natural Join By Remco Niemeijer In today’s Programming Praxis exercise we have to implement a program that joins two files with tables in […]

  2. Remco Niemeijer said

    My Haskell solution (see http://bonsaicode.wordpress.com/2010/06/15/programming-praxis-natural-join/ for a version with comments):

    import Data.List
    import Data.List.Split
    import System.Environment
    
    loadFile :: FilePath -> IO [[String]]
    loadFile = fmap (map (splitOn "\t") . lines) . readFile
    
    join :: Eq a => [[a]] -> [[a]] -> [[a]]
    
    join (ks1:f1) (ks2:f2) = union ks1 ks2 :
        [k1 : v1++v2 | (k1:v1) <- f1, (k2:v2) <- f2, k1 == k2]
    join _        _        = []
    
    main :: IO ()
    main = mapM_ putStrLn . map (intercalate "\t") .
           foldl1 join =<< mapM loadFile =<< getArgs
    
  3. kbob said

    Here’s a Python solution. It uses Python’s defaultdict(list) as a multimap from keys to records. make_table() and join_tables() do the work; the rest is just I/O.

    from collections import defaultdict
    
    def make_table(records):
        d = defaultdict(list)
        for rec in records:
            d[rec[0]].append(rec[1:])
        return d
    
    def join_tables(t1, t2):
        return [[k] + r1 + r2
                for k in t1
                for r1 in t1[k]
                for r2 in t2[k]]
             
    def read_file(fname):
        with open(fname) as f:
            return [line.strip('\n').split('\t') for line in f]
    
    def read_table(fname):
        return make_table(read_file(fname))
    
    def join_files(fn1, fn2):
        return join_tables(read_table(fn1), read_table(fn2))
    
    def print_records(records):
        for rec in records:
            print '\t'.join(str(f) for f in rec)
    
    print_records(join_tables(read_table('t1'), read_table('t2')))
    
  4. I’ve posted my Java solution here. The bulk of the code is I/O and the definition of rows and columns as full-fledged classes.

  5. Vikas Tandi said

    my solution in C
    http://codepad.org/k8tw70k0

Leave a comment