Cut
August 17, 2010
Unix V7 provided a utility called cut that reads its input a line at a time and selectively copies portions of each input line to standard output. Portions are selected either by character position or by character-delimited field. Cut is invoked as cut -clist [file …] or cut -flist [-dchar] [file …].
Character mode, invoked with the -c option, retains in the output those character positions that are mentioned in the list, which may contain column numbers, or ranges of column numbers separated by a dash, all separated by commas; counting starts from one. Field mode, invoked with the -f option, specifies a list of fields in a similar manner to character mode; fields are delimited by tab characters unless the field delimiter is changed by the -d option.
For example, the command cut -f1,3 -d: /etc/passwd prints user names and userid numbers from the password file.
Your task is to write a program to implement cut. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.
This was pretty fun to write.
#!/usr/bin/env python
"""Minimal re-implementation of the Unix ``cut`` utility.

Usage::

    cut -c LIST [FILE]...
    cut -f LIST [-d CHARACTER] [FILE]...

LIST is a comma-separated list of 1-based positions and ``N-M`` ranges.
"""
from optparse import OptionParser
from sys import stdout, stdin, exit


def parse_list(_list):
    """Expand a cut-style list into a sorted list of unique positions.

    Accepted items are ``N`` (a single position), ``N-M`` (an inclusive
    range) and ``-M`` (shorthand for ``1-M``).  A range whose end is smaller
    than its start contributes nothing.

    Args:
        _list: The list as given on the command line, e.g. ``"1,3-5"``.

    Returns:
        The selected 1-based positions in ascending order, without
        duplicates.

    Raises:
        ValueError: If an item is not numeric, is an open-ended ``N-`` range
            (it cannot be expanded into a finite list), or names a position
            smaller than 1.
    """
    positions = set()
    for item in _list.split(","):
        item = item.strip()
        if "-" in item:
            first, _, last = item.partition("-")
            if not last:
                raise ValueError(
                    "open-ended range '%s' is not supported" % item
                )
            start = int(first) if first else 1
            stop = int(last)
        else:
            start = stop = int(item)
        # Position 0 would silently wrap around to the end of the line.
        if start < 1 or stop < 1:
            raise ValueError("positions are numbered from 1: '%s'" % item)
        positions.update(range(start, stop + 1))
    # A set has no guaranteed iteration order, so sort explicitly.
    return sorted(positions)


def parse_options(argv=None):
    """Parse and validate the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        A tuple ``(options, args)`` where ``options`` carries
        ``character_list``, ``field_list`` and ``delimiter`` and ``args`` is
        the list of input file names.

    Raises:
        SystemExit: Through ``OptionParser.error`` when the options are
            inconsistent, the list is malformed, or an input file cannot be
            opened.
    """
    parser = OptionParser(
        usage="usage: %prog OPTION... [FILE]...",
        version="%prog 0.1",
    )
    parser.add_option(
        "-c", "--characters",
        dest="character_list",
        help="select only these characters",
        metavar="LIST",
        default=None,
    )
    parser.add_option(
        "-d", "--delimiter",
        dest="delimiter",
        help="use CHARACTER instead of TAB as a field delimiter",
        metavar="CHARACTER",
        default="\t",
    )
    parser.add_option(
        "-f", "--fields",
        dest="field_list",
        help="select only these fields",
        metavar="LIST",
        default=None,
    )
    options, args = parser.parse_args(argv)

    if options.character_list and options.field_list:
        parser.error("options -c and -f are mutually exclusive.")
    if not options.character_list and not options.field_list:
        parser.error("you must specify a list of characters or fields.")
    if not options.delimiter:
        parser.error("the delimiter must not be empty.")

    # Report a malformed list as a usage error instead of a traceback.
    try:
        parse_list(options.character_list or options.field_list)
    except ValueError as err:
        parser.error("invalid list: %s" % err)

    # Probe every input file up front, closing each handle immediately.
    for name in args:
        try:
            with open(name, "r"):
                pass
        except IOError:
            parser.error("file '%s' does not exist." % name)

    return options, args


def _cut_characters(line, positions):
    """Return the characters of ``line`` at the given 1-based positions."""
    return "".join(line[p - 1] for p in positions if p <= len(line))


def _cut_fields(line, positions, delimiter):
    """Return the selected fields of ``line`` joined by ``delimiter``.

    A line that does not contain the delimiter is returned unchanged, as
    the standard ``cut`` utility does.
    """
    if delimiter not in line:
        return line
    fields = line.split(delimiter)
    return delimiter.join(
        fields[p - 1] for p in positions if p <= len(fields)
    )


def _cut_stream(stream, positions, delimiter):
    """Cut every line of ``stream`` and write the result to stdout.

    ``delimiter`` is ``None`` in character mode and the field delimiter in
    field mode.
    """
    for raw in stream:
        # The line terminator is not data; drop it before selecting.
        line = raw[:-1] if raw.endswith("\n") else raw
        if delimiter is None:
            selected = _cut_characters(line, positions)
        else:
            selected = _cut_fields(line, positions, delimiter)
        stdout.write(selected + "\n")


def main(argv=None):
    """Run cut over every named file, or over stdin when none is given.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    options, args = parse_options(argv)

    if options.character_list:
        positions = parse_list(options.character_list)
        delimiter = None
    else:
        positions = parse_list(options.field_list)
        delimiter = options.delimiter

    if not args:
        _cut_stream(stdin, positions, delimiter)
        return
    for name in args:
        with open(name, "r") as source:
            _cut_stream(source, positions, delimiter)


if __name__ == "__main__":
    main()

# I couldn't help but have a try at it in Elisp. Of course I think it's
# useless, but there it is:
;;; cut.el --- Dimitri Fontaine
;;
;; https://programmingpraxis.com/2010/08/17/cut/
;;
(require 'cl-lib)

(defun dim:cut--parse-ranges (list)
  "Parse LIST, a string such as \"1-4,5,6-8\", into (BEGIN END) pairs.
Positions are 1-based and both ends are inclusive.  A single number N
becomes (N N).  In a range, an empty BEGIN means 1 and an empty END means
\"up to the end of the line\"."
  (mapcar (lambda (item)
            (if (string-match "\\`\\([0-9]*\\)-\\([0-9]*\\)\\'" item)
                (let ((b (match-string 1 item))
                      (e (match-string 2 item)))
                  (list (if (string= b "") 1 (string-to-number b))
                        (if (string= e "")
                            most-positive-fixnum
                          (string-to-number e))))
              (let ((n (string-to-number item)))
                (list n n))))
          (split-string list ",")))

(defun dim:cut--buffer-lines ()
  "Return the lines of the current buffer as a list of strings.
A final newline terminates the last line; it does not start an extra
empty one.  An empty buffer has no lines."
  (let ((text (buffer-substring-no-properties (point-min) (point-max))))
    (unless (string= text "")
      (split-string (if (string-match "\n\\'" text)
                        (substring text 0 -1)
                      text)
                    "\n"))))

(defun dim:cut--chars (line ranges)
  "Return the characters of LINE selected by RANGES, concatenated.
RANGES is a list of (BEGIN END) pairs; parts of a range that fall outside
LINE are ignored."
  (let ((len (length line)))
    (mapconcat (lambda (range)
                 (let ((b (max 1 (car range)))
                       (e (min len (cadr range))))
                   ;; `substring' takes an exclusive end, so the inclusive
                   ;; 1-based END is passed unchanged.
                   (if (> b e)
                       ""
                     (substring line (1- b) e))))
               ranges
               "")))

(defun dim:cut--fields (line ranges delimiter)
  "Return the fields of LINE selected by RANGES, joined with DELIMITER.
DELIMITER is matched literally.  A LINE that does not contain DELIMITER
is returned unchanged."
  (let* ((fields (split-string line (regexp-quote delimiter)))
         (count (length fields)))
    (if (= count 1)
        line
      (mapconcat #'identity
                 (cl-loop for (b e) in ranges
                          append (cl-loop for i from (max b 1)
                                          to (min e count)
                                          collect (nth (1- i) fields)))
                 delimiter))))

(defun dim:cut (mode list &optional delimiter)
  "Implement Unix cut in Emacs Lisp.  For the fun of it.
MODE is the symbol `char' or `field'.  LIST is a string of comma-separated
1-based positions and ranges, such as \"1-4,5,6-8\".  DELIMITER is the
field separator string used in `field' mode and defaults to a tab.

Each line of the current buffer is cut, the result replaces the contents
of the buffer \"*cut*\", and that buffer is shown in the selected window."
  (unless (memq mode '(char field))
    (error "Cut operates in `char' or `field' mode only."))
  (let* ((delimiter (or delimiter "\t"))
         (ranges (dim:cut--parse-ranges list))
         ;; Read the input before touching the output buffer, so that
         ;; cutting the \"*cut*\" buffer itself still works.
         (result (mapconcat
                  (lambda (line)
                    (concat (if (eq mode 'char)
                                (dim:cut--chars line ranges)
                              (dim:cut--fields line ranges delimiter))
                            "\n"))
                  (dim:cut--buffer-lines)
                  ""))
         (output (get-buffer-create "*cut*")))
    (with-current-buffer output
      (erase-buffer)
      (insert result))
    (set-window-buffer (selected-window) output)))

(defun cut (ranges &optional delimiter)
  "Interactive caller for `dim:cut'.
Prompt for RANGES.  With a prefix argument cut by character; otherwise
also prompt for a DELIMITER character and cut by field.  From Lisp,
DELIMITER may be a character, a string, or nil for the default tab."
  (interactive
   (list (read-string "ranges: ")
         (unless current-prefix-arg (read-char "delimiter: "))))
  (dim:cut (if current-prefix-arg 'char 'field)
           ranges
           (when (and delimiter (not current-prefix-arg))
             (if (stringp delimiter)
                 delimiter
               (char-to-string delimiter)))))

;; Here it is in ruby ...
require 'getoptlong' def parse_list(list) print_list = [] split_list = list.split(',') split_list.each do |element| if element =~ /-/ first_last = element.split('-') first_last[0].to_i.upto(first_last[1].to_i) do |f| print_list << f end else print_list << element.to_i end end print_list end def parse_line_fields(line, print_list, separator) all_fields = line.split(separator) print_list.each_with_index do |f, i| print "#{separator if i>0}#{all_fields[f-1]}" if f-1 <= all_fields.size end puts end def parse_line_columns(line, print_list) print_list.each do |f| print "#{line[f-1]}" if f-1 <= line.size end puts end # Set up the command line options opts = GetoptLong.new( ["--field", "-f", GetoptLong::REQUIRED_ARGUMENT], ["--separator", "-d", GetoptLong::REQUIRED_ARGUMENT], ["--column", "-c", GetoptLong::REQUIRED_ARGUMENT], ["--verbose", "-v", GetoptLong::NO_ARGUMENT] ) # Set the default values for the options field = false separator = "\t" column = false list = "" $verbose = false # Parse the command line options. If we find one we don't recognize # an exception will be thrown and we'll rescue with a message. begin opts.each do | opt, arg| case opt when "--field" field = true list = arg when "--separator" separator = arg when "--column" column = true list = arg when "--verbose" $verbose = true end end rescue puts "Illegal command line option." exit end print_list = parse_list(list) ARGV.each do |file_name| File.open(file_name) do | file | while line = file.gets if field parse_line_fields(line, print_list, separator) else parse_line_columns(line, print_list) end end end endThis one prints the fields/columns in the order that they are given in in the command line unlike the Unix cut. I like it better this way but if you don’t then just sort the print_list on the return from parse_list(). The only other oddity is printing the separator in the field. Basically, if we decide to print a field and we’re past the first element, we’ll print the separator before the field. 
This makes it so we don’t have to have a separator hanging out there after the last field is printed.
Here is my complete implementation in C:
http://codepad.org/QZL317EK