Xref

April 22, 2011

We solved this problem once before, in the exercise on the map-reduce algorithm. Today’s solution is more straight-forward.

Since we are interested in Scheme programs, we need to be able to pick out the Scheme identifiers from a source file. A Scheme identifier consists of lower-case letters, upper-case letters, digits, and the special characters ? ! . + – * / < = > : $ % ^ & _ ~ @; that definition is fairly broad, and includes things like -1234 and </em> that you might not think of as identifiers. Here we determine if a character is part of an identifier:

(define (char-in-ident? c) ; scheme identifiers
  (or (char-alphabetic? c) (char-numeric? c)
      (member c (string->list "?!.+-*/<=>:$%^&_~@"))))

Your programming language may have different rules. Next we have the function that gets the next identifier in the input, where an identifier is a maximal sequence of characters that can be included in an identifier. Get-ident returns a null string at the end of a line, so the caller can count line numbers:

(define (get-ident)
  (let loop ((c (peek-char)) (cs '()))
    (cond ((eof-object? c) (if (null? cs) c (list->string (reverse cs))))
          ((char-in-ident? c)
            (let ((x (read-char))) (loop (peek-char) (cons x cs))))
          ((pair? cs) (list->string (reverse cs)))
          ((char=? #\newline c) (read-char) "")
          (else (read-char) (loop (peek-char) cs)))))

The main xref function loops over the input words, counting lines and accumulating a list of word/line-number pairs:

(define (xref file)
  (with-input-from-file file (lambda ()
    (let loop ((w (get-ident)) (line 1) (ws '()))
      (cond ((eof-object? w) (xref-out ws))
            ((string=? "" w) (loop (get-word) (add1 line) ws))
            (else (loop (get-word) line (cons (cons w line) ws))))))))

Once the list of word/line-number pairs is accumulated, the xref-out function sorts them on word and line-number, then writes them using a control-break on word:

(define (xref-out ws)
  (define (lt? a b)
    (or (string<? (car a) (car b))
        (and (string=? (car a) (car b))
             (< (cdr a) (cdr b)))))
  (let loop ((ws (sort lt? ws)) (prev-word "") (prev-line 0))
    (cond ((null? ws) (newline))
          ((and (string=? (caar ws) prev-word) (= (cdar ws) prev-line))
            (loop (cdr ws) prev-word prev-line))
          ((string=? (caar ws) prev-word)
            (display " ") (display (cdar ws))
            (loop (cdr ws) prev-word (cdar ws)))
          (else (when (not (string=? prev-word "")) (newline))
                (display (caar ws)) (display " ") (display (cdar ws))
                (loop (cdr ws) (caar ws) (cdar ws))))))

And that’s it. To display a cross-reference for the file xref.ss that contains the above code, with lines numbered as at http://programmingpraxis.codepad.org/h4D8umzr, say (xref "xref.ss"); the output was shown on the previous page.

About these ads

Pages: 1 2

10 Responses to “Xref”

  1. arturasl said

    Solution in C++: github
    Outputs any alphanumeric word which does not start with digit and which is neither part of comment nor literal string

  2. My Haskell solution (see http://bonsaicode.wordpress.com/2011/04/22/programming-praxis-xref/ for a version with comments):

    import Data.List
    import qualified Data.List.Key as K
    import Language.Haskell.Lexer
    
    main :: IO ()
    main = do file <- readFile "test.hs"
              mapM_ putStrLn . map ((\((n:_), ls) -> unwords $ n : nub ls) .
                  unzip) . K.group fst $ K.sort fst
                  [(s, show $ line p) | (tok, (p,s)) <- lexerPass0 file, 
                   elem tok [Varid, Conid, Varsym, Consym]]
    
  3. Graham said

    For lack of a good knowledge of Python lexical parsers:

    #!/usr/bin/env python
    
    from collections import defaultdict
    from re import compile
    
    def xref(file_name):
        exclude = compile(r"[\\\"'\(\)\[\]:?.,]")       # unwanted characters
        comment = compile(r"\#.*")                      # comments
        with open(file_name) as f:
            lines = f.readlines()
            d = defaultdict(list)
            for line_num in xrange(len(lines)):
                line = exclude.sub(' ', lines[line_num])
                line = comment.sub('', line).strip()
                for word in line.split():
                    d[word].append(line_num + 1)
        return d
    
    def output(x_d):
        for entry in sorted(str(k) + "\t" + ", ".join(str(i) for i in v) for (k, v)
                in x_d.iteritems()):
            print entry
        return None
    
    if __name__ == "__main__":
        import sys
        output(xref(sys.argv[1] if sys.argv[1:] else "xref.py"))
    
  4. Veer said

    @pp : I don’t think 0 can be considered identifier in scheme.
    Your first line output 0 as identifier.

  5. programmingpraxis said

    Veer: I don’t know if 0 is officially a Scheme identifier, or not, but it doesn’t matter. The cross-referencer is more useful if it includes 0 in the list, and that’s good enough for me.

  6. slabounty said

    Here’s a quick one in ruby …

    xref = Hash.new
    linecount = 0
    IO.foreach(ARGV[0]) do |line|
        linecount += 1
        line.sub!(/#.*/, "")
        tokens = line.split(/[^A-Za-z0-9]/)
        tokens.each do |token|
            (xref[token] ||= []) << linecount if token != ""
        end
    end
    
    xref.each_pair do |t, l|
        puts "#{t}: #{l}"
    end
    

    Someone mentioned taking out the literal strings, but my feeling was that they should be left in as part of the cross reference. For splitting, I just split on anything that wasn’t a letter, number, or underscore. I think that should get most everything correct. There are probably some edge cases though (floating point numbers come to mind).

    If anyone’s wondering, you can’t do Hash.new([]) to get an empty array as the default value of the new hash element. That will create a single array that’s shared by each hash element. Hence the funny looking line 8 which a) creates a new empty array if the hash element is empty and then b) adds the token to it (or the existing array).

  7. David said

    Here is one in Factor. A bit more complicated since any non-space printing character can be used in identifiers. Therefore it is necessary to filter out anything that is not an identifier and whatever is left must be an identifier. Lots of edge cases here, but I chose to filter out string constants, integers, end of line comments, stack comments and words ending with “:” (e.g. CONSTANT:, :, USE:, etc.), and sequence constructors ({ }, H{ }, etc.)

    ! very simple xref
    
    USING: kernel math math.order regexp ascii io.encodings.ascii formatting sorting
           io io.files assocs sequences strings ;
    IN: xref
    
    CONSTANT: xref-dict H{ }
    
    CONSTANT: content-filter  R/ \s!.*$|^!.*$||\b[0-9]+\b|\b"[^"]*"\b|\b[HV]?{\b|[][};]|\b.*:\b|\(.*--.*\)\b/
    
    : filter-content ( str -- str )
       content-filter "" re-replace ;
    
    : addref  ( symbol line -- )
        over xref-dict at
          [ rot drop ]
          [ V{ } clone [ rot xref-dict set-at ] keep ] if*
        swap suffix! drop ;
    
    : gobble-input  ( -- )
        1
        [
          filter-content  R/ \s+/ re-split
          [ >string dup length 0 >
            [ over addref ] [ drop ] if ] each
          1 +
        ] each-line drop ;
     
    : report ( -- )
        xref-dict keys [ <=> ] sort
        [ dup xref-dict at 
          swap "%s " printf [ "%d " printf ] each nl ] each ;
    
    : xref ( file -- )
        xref-dict clear-assoc
        ascii <file-reader> [ gobble-input ] with-input-stream
        report ;
    

    ( scratchpad ) “xref.factor” xref
    + 26
    30
    36
    > 24
    >string 24
    R/ 23
    \b|\\b/ 9
    \s+/ 23
    addref 14 25
    ascii 3 36
    assocs 4
    at 15 31
    clear-assoc 35
    clone 17
    content-filter 12
    drop 16 18 25 27
    dup 24 31
    each 25 32 32
    each-line 27
    filter-content 11 23
    formatting 3
    gobble-input 20 36
    if 25
    if* 17
    io 4
    io.encodings.ascii 3
    io.files 4
    keep 17
    kernel 3
    keys 30
    length 24
    math 3
    math.order 3
    nl 32
    over 15 25
    printf 32 32
    re-replace 12
    re-split 23
    regexp 3
    report 29 37
    rot 16 17
    sequences 4
    set-at 17
    sort 30
    sorting 3
    strings 4
    suffix! 18
    swap 18 32
    with-input-stream 36
    xref 5 34
    xref-dict 7 15 17 30 31 35

  8. David said

    Ooh,

    The output above looks wrong, but is correct, problem is that first three lines was interpreted as HTML. Should be:

    + 26
    <=> 30
    <file-reader> 36

  9. Rainer said

    My try in REXX, this script analyzes itself

    sym = ''
    asym. = ''
    nbr_zeilen = sourceline()
    znr = 0
    do znr = 1 to nbr_zeilen
        zeile = sourceline(znr)
        zeile = translate(zeile,'  ','()')
        do while length(zeile) > 0
            parse value zeile with first zeile
            if pos(first, sym) == 0 then sym = sym first
            asym.first = asym.first znr 
        end
    end
    
    sym = bubble(sym)
    do while length(sym) > 0
        parse value sym with first sym
        say first asym.first
    end
    exit
    
    bubble: procedure
        parse arg text
        t. = ''
        i = 0
        do while length(text) > 0
            parse value text with first text
            i = i + 1
            t.i = first
        end
        chg = 1
        do while chg == 1
            chg = 0
            do j = 1 to i-1
                do k = j+1 to i
                    if t.j > t.k then do
                        tmp = t.j
                        t.j = t.k
                        t.k = tmp
                        chg = 1
                    end
                end
            end
        end
        text = ''
        do j = 1 to i
            text = text t.j
        end
        return strip(text)
    

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 575 other followers

%d bloggers like this: