Cut

August 17, 2010

Unix V7 provided a utility called cut that reads its input a line at a time and selectively copies portions of each input line to standard output. Portions are selected either by character position or by character-delimited field. Cut is invoked as cut -clist [file …] or cut -flist [-dchar] [file …].

Character mode, invoked with the -c option, retains in the output those character positions that are mentioned in the list, which may contain column numbers, or ranges of column numbers separated by a dash, all separated by commas; counting starts from one. Field mode, invoked with the -f option, specifies a list of fields in a similar manner to character mode; fields are delimited by tab characters unless the field delimiter is changed by the -d option.

For example, the command cut -f1,3 -d: /etc/passwd prints user names and userid numbers from the password file.

Your task is to write a program to implement cut. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.

Pages: 1 2

4 Responses to “Cut”

  1. Bogdan Popa said

    This was pretty fun to write.

    #!/usr/bin/env python
    from optparse import OptionParser
    from sys import stdout, stdin, exit
    
    def parse_list(_list):
        """Expand a cut-style LIST into sorted, unique 1-based positions.

        _list is a comma-separated string whose items are either a single
        number ("3") or an inclusive range ("2-5").  A range with no start
        ("-4") begins at position 1.

        Returns a list of ints in ascending order without duplicates.

        Raises ValueError for non-numeric items, for positions below 1, and
        for ranges with no end ("3-"), which cannot be expanded without
        knowing the line length.
        """
        positions = set()

        for item in _list.split(","):
            if "-" in item:
                first, last = item.split("-", 1)
                if not last:
                    raise ValueError("open-ended range %r is not supported" % item)
                start = int(first) if first else 1
                stop = int(last)
            else:
                start = stop = int(item)

            # Position 0 would silently turn into index -1 (the last item).
            if start < 1:
                raise ValueError("positions are numbered from 1: %r" % item)

            positions.update(range(start, stop + 1))

        # A set has no defined order; callers rely on ascending positions.
        return sorted(positions)
    
    def parse_options():
        """Parse and validate the command line.

        Exactly one of -c/--characters and -f/--fields must be given, and
        every FILE argument must be openable for reading.  Any violation is
        reported through parser.error(), which prints the usage text and
        exits the process.

        Returns the (options, args) pair produced by OptionParser, where
        args holds the FILE arguments.
        """
        parser = OptionParser(
            usage="usage: %prog OPTION... [FILE]...",
            version="%prog 0.1",
        )
        parser.add_option(
            "-c", "--characters",
            dest="character_list",
            help="select only these characters",
            metavar="LIST",
            default=None,
        )
        parser.add_option(
            "-d", "--delimiter",
            dest="delimiter",
            help="use CHARACTER instead of TAB as a field delimiter",
            metavar="CHARACTER",
            default="\t",
        )
        parser.add_option(
            "-f", "--fields",
            dest="field_list",
            help="select only these fields",
            metavar="LIST",
            default=None,
        )

        options, args = parser.parse_args()

        if options.character_list and options.field_list:
            parser.error("options -c and -f are mutually exclusive.")

        if not options.character_list and not options.field_list:
            parser.error("you must specify a list of characters or fields.")

        # Probe every file (not just the first) and close each handle again;
        # report the real reason, since a failure is not always "missing".
        for name in args:
            try:
                with open(name, "r"):
                    pass
            except IOError as err:
                parser.error("cannot open file '%s': %s" % (name, err.strerror))

        return options, args
    
    def _select_chars(line, positions):
        """Return the characters of line at the 1-based positions that exist."""
        return "".join(line[p - 1] for p in positions if 0 < p <= len(line))

    def _select_fields(line, positions, delimiter):
        """Return the selected fields of line, re-joined with delimiter.

        A line that does not contain the delimiter is returned unchanged.
        """
        if delimiter not in line:
            return line
        fields = line.split(delimiter)
        return delimiter.join(
            fields[p - 1] for p in positions if 0 < p <= len(fields)
        )

    def _cut_stream(stream, select):
        """Write select(line) plus a newline to stdout for each line of stream."""
        for line in stream:
            # Drop the line terminator so it is never treated as a column or
            # as part of the last field.
            stdout.write(select(line.rstrip("\n")))
            stdout.write("\n")

    def main():
        """Run cut: copy the selected columns or fields of each input line.

        Input is read line by line from every FILE argument in turn, or from
        standard input when no file is given.  Positions that lie beyond the
        end of a line are skipped instead of raising IndexError.
        """
        options, args = parse_options()

        # Sort and de-duplicate here as well, so output order does not
        # depend on the order in which parse_list happens to return items.
        if options.character_list:
            positions = sorted(set(parse_list(options.character_list)))

            def select(line):
                return _select_chars(line, positions)
        else:
            positions = sorted(set(parse_list(options.field_list)))
            delimiter = options.delimiter

            def select(line):
                return _select_fields(line, positions, delimiter)

        if not args:
            _cut_stream(stdin, select)
            return

        for name in args:
            with open(name, "r") as source:
                _cut_stream(source, select)
    
    # Run the command-line tool only when executed as a script, not on import.
    if __name__ == "__main__":
    	main()
    
  2. dim said

    I couldn’t help but have a try at it in Elisp. Of course I think it’s useless, but there it is:

    ;;; cut.el --- Dimitri Fontaine
    ;;
    ;; https://programmingpraxis.com/2010/08/17/cut/
    ;;
    (eval-when-compile (require 'cl))
    
    (defun dim:cut (mode list &optional delimiter)
      "Implement Unix cut in Emacs Lisp.  For the fun of it.

    MODE is the symbol `char' or `field'.  LIST is a comma-separated
    string of 1-based positions or inclusive ranges, e.g. \"1-4,5,6-8\".
    DELIMITER is the field separator string used in `field' mode; it
    defaults to a tab and is matched literally, not as a regexp.

    Every line of the current buffer is cut and the result is written
    to the buffer \"*cut*\", which is then displayed in the selected
    window.  Positions beyond the end of a line are ignored."
      (unless (member mode '(char field))
        (error "Cut operates in `char' or `field' mode only."))
      (let* ((output (get-buffer-create "*cut*"))
             (sep (or delimiter "\t"))
             (ranges (mapcar (lambda (x)
                               ;; split ranges, 1-4,5,6-8
                               (if (string-match "-" x)
                                   (mapcar 'string-to-number (split-string x "-"))
                                 (list (string-to-number x) (string-to-number x))))
                             (split-string list ",")))
             (lines (split-string
                     (buffer-substring-no-properties (point-min) (point-max)) "\n"))
             (result
              (mapconcat
               (lambda (line)
                 (if (eq mode 'char)
                     (mapconcat
                      (lambda (range)
                        ;; `substring' takes a 0-based start and an
                        ;; exclusive end, so columns B..E are [B-1, E).
                        (let ((start (1- (max 1 (car range))))
                              (end (min (cadr range) (length line))))
                          (if (< start end) (substring line start end) "")))
                      ranges "")
                   ;; field based cutting
                   (let ((fields (split-string line (regexp-quote sep)))
                         (selected nil))
                     (dolist (range ranges)
                       (let ((i (max 1 (car range))))
                         (while (<= i (cadr range))
                           (let ((field (nth (1- i) fields)))
                             (when field (push field selected)))
                           (setq i (1+ i)))))
                     ;; Join only the fields that exist, so no delimiter
                     ;; is left dangling at the end of the line.
                     (mapconcat 'identity (nreverse selected) sep))))
               lines "\n")))
        (with-current-buffer output
          (erase-buffer)
          (insert result "\n"))
        (set-window-buffer (selected-window) output)))
    
    (defun cut (ranges &optional delimiter)
      "Interactive caller for `dim:cut'.

    RANGES is the comma-separated position list passed on to `dim:cut'.
    With a prefix argument, cut by character and ignore DELIMITER.
    Otherwise cut by field; DELIMITER may be a character (as read
    interactively), a string, or nil to let `dim:cut' use its default."
      (interactive (list (read-string "ranges: ")
                         (unless current-prefix-arg
                           (read-char "delimiter: "))))
      (dim:cut (if current-prefix-arg 'char 'field)
               ranges
               ;; `char-to-string' signals an error on nil or on a string,
               ;; so convert only when DELIMITER really is a character.
               (cond (current-prefix-arg nil)
                     ((stringp delimiter) delimiter)
                     (delimiter (char-to-string delimiter)))))
    

  3. slabounty said

    Here it is in ruby …

    require 'getoptlong'
    
    # Expand a comma-separated list such as "1,3-5" into an array of
    # integers. Ranges are inclusive, and items keep the order in which
    # they were written.
    def parse_list(list)
        list.split(',').flat_map do |item|
            if item =~ /-/
                bounds = item.split('-')
                (bounds[0].to_i..bounds[1].to_i).to_a
            else
                [item.to_i]
            end
        end
    end
    
    # Print the fields of +line+ named by +print_list+ (1-based), joined by
    # +separator+ and followed by a single newline. Fields the line does
    # not have are skipped, so no stray separator is printed for them.
    def parse_line_fields(line, print_list, separator)
        # Chomp first so the line terminator is not part of the last field.
        # The escaped regexp splits literally (a " " string would split on
        # runs of whitespace) and -1 keeps trailing empty fields.
        all_fields = line.chomp.split(Regexp.new(Regexp.escape(separator)), -1)
        wanted = print_list.select { |f| f >= 1 && f <= all_fields.size }
        puts wanted.map { |f| all_fields[f - 1] }.join(separator)
    end
    
    # Print the characters of +line+ at the 1-based columns in +print_list+,
    # followed by a single newline. Columns past the end of the line are
    # skipped.
    def parse_line_columns(line, print_list)
        # Chomp first so the line terminator can never be selected.
        text = line.chomp
        wanted = print_list.select { |f| f >= 1 && f <= text.size }
        # text[i, 1] always yields a one-character String; text[i] returned
        # an Integer on Ruby 1.8.
        puts wanted.map { |f| text[f - 1, 1] }.join
    end
    
    # Set up the command line options
    opts = GetoptLong.new(
        ["--field", "-f", GetoptLong::REQUIRED_ARGUMENT],
        ["--separator", "-d", GetoptLong::REQUIRED_ARGUMENT],
        ["--column", "-c", GetoptLong::REQUIRED_ARGUMENT],
        ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
        )

    # Set the default values for the options
    field = false
    separator = "\t"
    column = false
    list = ""
    $verbose = false

    # Parse the command line options. If we find one we don't recognize
    # an exception will be thrown and we'll rescue with a message.
    begin
        opts.each do |opt, arg|
            case opt
            when "--field"
                field = true
                list = arg
            when "--separator"
                separator = arg
            when "--column"
                column = true
                list = arg
            when "--verbose"
                $verbose = true
            end
        end
    rescue GetoptLong::Error
        warn "Illegal command line option."
        # Exit non-zero so callers can tell the run failed.
        exit 1
    end

    # Exactly one selection mode is required; with neither there is no
    # list to apply, and with both the list would be ambiguous.
    if field == column
        warn "Specify exactly one of --field (-f) or --column (-c)."
        exit 1
    end

    print_list = parse_list(list)

    # ARGF reads every file left in ARGV in turn, or standard input when
    # no file names were given.
    ARGF.each_line do |line|
        if field
            parse_line_fields(line, print_list, separator)
        else
            parse_line_columns(line, print_list)
        end
    end
    

    This one prints the fields/columns in the order in which they are given on the command line, unlike the Unix cut. I like it better this way, but if you don’t, just sort the print_list on the return from parse_list(). The only other oddity is how the separator is printed in field mode. Basically, if we decide to print a field and we’re past the first element, we print the separator before the field. That way there is no separator left hanging after the last field is printed.

  4. Vikas Tandi said

    Here is my complete implementation in c
    http://codepad.org/QZL317EK

Leave a comment