Cut

August 17, 2010

We begin with a function to expand ranges, which is reminiscent of a previous exercise:

(define (expand-ranges str)
  ;; Turn a comma-separated list of numbers and lo-hi ranges, such as
  ;; "1-3,7", into one flat list of numbers, in the order written.
  (define (item->numbers item)
    (let* ((nums (map string->number (string-split #\- item)))
           (lo (car nums)))
      (if (pair? (cdr nums))
          ;; hi + 1 is passed as the stop value so that hi itself is
          ;; included (presumably range excludes its upper bound)
          (range lo (+ (cadr nums) 1))
          (list lo))))
  (let ((items (string-split #\, str)))
    (apply append (map item->numbers items))))

Cut operates in two modes. In character mode, it writes the characters corresponding to the expanded range (remember that character positions are counted from one, not zero), followed by a newline. Field mode is harder, because first the fields must be split on the delimiter, then the delimiter must be inserted between fields (but not at the end of the line):

(define (write-chars cs str)
  ;; Display the characters of str found at the 1-based positions listed
  ;; in cs, in list order, followed by a newline. Positions that fall
  ;; outside the string are skipped rather than passed to string-ref,
  ;; so a line shorter than the requested range does not raise an error.
  (let ((len (string-length str)))
    (do ((cs cs (cdr cs))) ((null? cs) (newline))
      (if (<= 1 (car cs) len)
          (display (string-ref str (- (car cs) 1)))))))

(define (write-fields fs str delim)
  ;; Split str on the character delim and display the fields whose
  ;; 1-based numbers are listed in fs, in list order, with delim between
  ;; consecutive printed fields and a newline at the end.
  ;; Field numbers beyond the fields actually present are skipped rather
  ;; than passed to list-ref, so short lines do not raise an error.
  (let* ((fields (string-split delim str))
         (count (length fields)))
    (let loop ((fs fs) (first? #t))
      (cond ((null? fs) (newline))
            ((<= 1 (car fs) count)
             ;; the delimiter goes before every printed field but the
             ;; first, so none is left dangling at the end of the line
             (if (not first?) (display delim))
             (display (list-ref fields (- (car fs) 1)))
             (loop (cdr fs) #f))
            (else (loop (cdr fs) first?))))))

Do-file handles a single file, regardless of character mode or field mode, leaving the task of setting the current input port to the caller. The two legs of the if each handle one mode, using a do loop to process each line individually:

(define (do-file opts)
  ;; Cut every line read from the current input port. opts is the
  ;; association list of option character -> argument string built by
  ;; getopt. When a #\c entry is present the lines are cut by character
  ;; position; otherwise they are cut by field using the #\f list.
  ;; The field delimiter is the first character of the #\d argument and
  ;; falls back to a tab when -d was not given, matching the usage
  ;; message, which marks -d as optional.
  (define (each-line proc)
    ;; apply proc to each line until end of input
    (do ((line (read-line) (read-line)))
        ((eof-object? line))
      (proc line)))
  (if (assoc #\c opts)
      (let ((cs (expand-ranges (cdr (assoc #\c opts)))))
        (each-line (lambda (line) (write-chars cs line))))
      (let* ((fs (expand-ranges (cdr (assoc #\f opts))))
             (d (assoc #\d opts))
             (delim (if d (string-ref (cdr d) 0) #\tab)))
        (each-line (lambda (line) (write-fields fs line delim))))))

All that’s left is the main program, which extracts parameters from the command line, then calls do-file to handle the current input port if there are no files on the command line, or processes the files individually in a do loop if one or more files are named on the command line:

;; Main program: parse the command line, then cut the current input port
;; when no files are named, or each named file in turn otherwise.
(let-values (((opts files)
              (getopt "c:d:f:"
                      "usage: cut -clist [file ...] or cut -flist [-dchar] [file ...]"
                      (cdr (command-line)))))
  (define (cut-file name)
    ;; rebind the current input port to the file while do-file runs
    (with-input-from-file name (lambda () (do-file opts))))
  (if (pair? files)
      (for-each cut-file files)
      (do-file opts)))

Note that command-line is specific to Chez Scheme, and must change for other Scheme implementations. We used range, read-line, and string-split from the Standard Prelude, and getopt from an earlier exercise. You can see the program assembled at http://programmingpraxis.codepad.org/U3Z6l5bV.

Advertisement

Pages: 1 2

4 Responses to “Cut”

  1. Bogdan Popa said

    This was pretty fun to write.

    #!/usr/bin/env python
    from optparse import OptionParser
    from sys import stdout, stdin, exit
    
    def parse_list(_list):
        """Expand a cut-style LIST such as "1-3,7" into sorted positions.

        Each comma-separated item is either a single 1-based position or an
        inclusive ``first-last`` range.  Duplicates are collapsed and the
        positions are returned as a list in ascending order; iterating a
        bare set gives no ordering guarantee, so the result is sorted
        explicitly.

        Raises ValueError when an item is not a number or a pair of numbers.
        """
        positions = set()

        for item in _list.split(","):
            if "-" in item:
                bounds = item.split("-")
                # +1 because the upper bound of a LIST range is inclusive.
                positions.update(range(int(bounds[0]), int(bounds[1]) + 1))
            else:
                positions.add(int(item))

        return sorted(positions)
    
    def parse_options():
        """Parse the command line and return ``(options, args)``.

        ``options`` carries ``character_list``, ``field_list`` (both None
        unless given) and ``delimiter`` (a tab by default); ``args`` is the
        list of file names left over after option parsing.

        Exits through ``parser.error`` when -c and -f are combined, when
        neither is given, or when a named file cannot be opened.
        """
        parser = OptionParser(
            usage="usage: %prog OPTION... [FILE]...",
            version="%prog 0.1",
        )
        parser.add_option(
            "-c", "--characters",
            dest="character_list",
            help="select only these characters",
            metavar="LIST",
            default=None,
        )
        parser.add_option(
            "-d", "--delimiter",
            dest="delimiter",
            help="use CHARACTER instead of TAB as a field delimiter",
            metavar="CHARACTER",
            default="\t",
        )
        parser.add_option(
            "-f", "--fields",
            dest="field_list",
            help="select only these fields",
            metavar="LIST",
            default=None,
        )

        options, args = parser.parse_args()

        if options.character_list and options.field_list:
            parser.error("options -c and -f are mutually exclusive.")

        if not options.character_list and not options.field_list:
            parser.error("you must specify a list of characters or fields.")

        # Probe every named file up front; the with-block closes the handle
        # immediately instead of leaving it open until garbage collection.
        for name in args:
            try:
                with open(name, "r"):
                    pass
            except IOError:
                parser.error("file '%s' does not exist." % name)

        return options, args
    
    def main():
        """Cut characters or fields from every named file, or from stdin.

        Options come from ``parse_options``.  Each input line is stripped
        of its trailing newline, the selected 1-based characters or fields
        are written to stdout, and a single newline ends the output line.
        Positions beyond the end of a line are skipped.  In field mode the
        delimiter is written only between the fields that were printed.
        """
        options, args = parse_options()
        by_character = bool(options.character_list)

        if by_character:
            positions = parse_list(options.character_list)
        else:
            positions = parse_list(options.field_list)

        def cut_line(line):
            # Drop the newline kept by file iteration so it is neither
            # selected as a character nor glued onto the last field.
            line = line.rstrip("\n")
            if by_character:
                parts, glue = line, ""
            else:
                parts, glue = line.split(options.delimiter), options.delimiter
            return glue.join(
                parts[i - 1] for i in positions if 0 < i <= len(parts)
            )

        def cut_stream(stream):
            for line in stream:
                stdout.write(cut_line(line) + "\n")

        if not args:
            cut_stream(stdin)
            return

        # Handle every file named on the command line, closing each one
        # as soon as it has been processed.
        for name in args:
            with open(name, "r") as handle:
                cut_stream(handle)

    if __name__ == "__main__":
        main()
    
  2. dim said

    I couldn’t help but have a try at it in Elisp. Of course I think it’s useless, but there it is:

    ;;; cut.el --- Dimitri Fontaine
    ;;
    ;; https://programmingpraxis.com/2010/08/17/cut/
    ;;
    (eval-when-compile (require 'cl))
    
    (defun dim:cut (mode list &optional delimiter)
      "Copy selected characters or fields of each buffer line into *cut*.

    MODE is the symbol `char' or `field'.  LIST is a string of
    comma-separated 1-based positions or inclusive ranges, such as
    \"1-4,5,6-8\".  In `field' mode each line is split on the literal
    string DELIMITER (a tab when nil) and the selected fields are
    joined with that same string.  Positions past the end of a line
    are skipped.  The text of the current buffer is read, the result
    replaces the contents of the *cut* buffer, and that buffer is
    displayed in the selected window.  Signals an error for any other
    MODE."
      (unless (member mode '(char field))
        (error "Cut operates in `char' or `field' mode only."))
      (let* ((output (get-buffer-create "*cut*"))
             (sep (or delimiter "\t"))
             (ranges (mapcar (lambda (x)
                               ;; "1-4" becomes (1 4); a lone "5" becomes (5 5)
                               (if (string-match "-" x)
                                   (mapcar 'string-to-number (split-string x "-"))
                                 (list (string-to-number x) (string-to-number x))))
                             (split-string list ",")))
             (content (mapcar (lambda (line)
                                (if (eq mode 'char) line
                                  ;; split-string takes a regexp, so quote the
                                  ;; delimiter to have it matched literally
                                  (split-string line (regexp-quote sep))))
                              (split-string
                               (buffer-substring-no-properties (point-min) (point-max)) "\n"))))
        (with-current-buffer output
          (erase-buffer)
          (insert
           (loop for line in content
                 concat (concat
                         (if (eq mode 'char)
                             (loop for (b e) in ranges
                                   concat (let ((start (max (- b 1) 0))
                                                (end (min e (length line))))
                                            ;; substring excludes its END index, so
                                            ;; the inclusive 1-based range b..e is
                                            ;; the half-open span [b-1, e)
                                            (if (< start end)
                                                (substring line start end)
                                              "")))
                           ;; field mode: gather the selected fields in order and
                           ;; put the delimiter only between them
                           (mapconcat 'identity
                                      (loop for (b e) in ranges
                                            append (loop for i from (max b 1)
                                                         to (min e (length line))
                                                         collect (nth (- i 1) line)))
                                      sep))
                         "\n"))))
        (set-window-buffer (selected-window) output)))
    
    (defun cut (ranges &optional delimiter)
      "Prompt for RANGES and run `dim:cut' on the current buffer.

    With a prefix argument, cut in `char' mode and do not ask for a
    delimiter.  Without one, also read a DELIMITER character and cut
    in `field' mode."
      (interactive
       (let ((spec (read-string "ranges: ")))
         (list spec
               (if current-prefix-arg
                   nil
                 (read-char "delimiter: ")))))
      (if current-prefix-arg
          (dim:cut 'char ranges nil)
        (dim:cut 'field ranges (char-to-string delimiter))))
    

  3. slabounty said

    Here it is in ruby …

    require 'getoptlong'
    
    # Expand a list such as "1-3,7" into the array of numbers it names,
    # keeping the order in which the items were written. A "first-last"
    # item contributes every integer from first through last.
    def parse_list(list)
        list.split(',').flat_map do |item|
            if item.include?('-')
                low, high = item.split('-')
                (low.to_i..high.to_i).to_a
            else
                [item.to_i]
            end
        end
    end
    
    # Print the fields of +line+ whose 1-based numbers appear in
    # +print_list+, in list order, joined by +separator+ and ended by one
    # newline.
    #
    # The line terminator is removed first so it is not glued onto the last
    # field. Field numbers outside 1..field-count are skipped, and the
    # separator is placed only between fields that are actually printed.
    def parse_line_fields(line, print_list, separator)
        # limit -1 keeps trailing empty fields, which split drops by default
        all_fields = line.chomp.split(separator, -1)
        wanted = print_list.select { |f| f >= 1 && f <= all_fields.size }
        puts wanted.map { |f| all_fields[f - 1] }.join(separator)
    end
    
    # Print the characters of +line+ at the 1-based positions in
    # +print_list+, in list order, followed by one newline.
    #
    # The line terminator is removed first so it can never be selected as a
    # column, and positions outside 1..length are skipped.
    def parse_line_columns(line, print_list)
        text = line.chomp
        wanted = print_list.select { |f| f >= 1 && f <= text.size }
        # text[i, 1] returns a one-character string
        puts wanted.map { |f| text[f - 1, 1] }.join
    end
    
    # Set up the command line options
    opts = GetoptLong.new(
        ["--field", "-f", GetoptLong::REQUIRED_ARGUMENT],
        ["--separator", "-d", GetoptLong::REQUIRED_ARGUMENT],
        ["--column", "-c", GetoptLong::REQUIRED_ARGUMENT],
        ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
        )

    # Set the default values for the options
    field = false
    separator = "\t"
    column = false
    list = ""
    $verbose = false

    # Parse the command line options. An unrecognized option or a missing
    # argument raises a GetoptLong::Error, which is reported here; the
    # non-zero exit status lets calling scripts detect the failure.
    begin
        opts.each do |opt, arg|
            case opt
            when "--field"
                field = true
                list = arg
            when "--separator"
                separator = arg
            when "--column"
                column = true
                list = arg
            when "--verbose"
                $verbose = true
            end
        end
    rescue GetoptLong::Error
        puts "Illegal command line option."
        exit 1
    end

    print_list = parse_list(list)

    # ARGF walks the files left in ARGV once the options have been removed,
    # and falls back to standard input when no file is named, so the
    # script also works at the end of a pipeline.
    ARGF.each_line do |line|
        if field
            parse_line_fields(line, print_list, separator)
        else
            parse_line_columns(line, print_list)
        end
    end
    

    This one prints the fields/columns in the order in which they are given on the command line, unlike the Unix cut. I like it better this way, but if you don’t, just sort the print_list returned from parse_list(). The only other oddity is how the separator is printed in field mode. Basically, if we decide to print a field and we’re past the first element, we’ll print the separator before the field. This makes it so we don’t have to have a separator hanging out there after the last field is printed.

  4. Vikas Tandi said

    Here is my complete implementation in c
    http://codepad.org/QZL317EK

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: