Assembler, Part 1
April 15, 2014
In this exercise and the next one we will write a simple assembler for a hypothetical computer; we are following the assembler in the book The Awk Programming Langauge by Alfred V. Aho, Brian W. Kernighan and Peter J. Weinberger. Our computer has a memory of a thousand five-digit words, a single accumulator, and eleven opcodes:
| OPCODE | INSTRUCTION | DESCRIPTION |
|---|---|---|
00 |
const C |
assembler pseudo-operator to define a constant C |
01 |
get |
read a number from the input to the accumulator |
02 |
put |
write the number in the accumulator to the output |
03 |
ld M |
load accumulator with contents of memory location M |
04 |
st M |
store contents of accumulator in memory location M |
05 |
add M |
add contents of memory location M to the accumulator |
06 |
sub M |
subtract contents of memory location M from the accumulator |
07 |
jpos M |
jump to memory location M if the accumulator is positive |
08 |
jz M |
jump to memory location M if the accumulator is zero |
09 |
j |
jump to memory location M, unconditionally |
10 |
halt |
stop program execution |
An assembly-language program is a series of blank lines and statements consisting of up to four fields: The first field, if it exists, is a label; it must start at the first position on the line and may not start with a digit. The second field, which is mandatory, is the opcode; it follows the optional label and mandatory spaces. The third field, which is used only with some opcodes, is the object; if it is present, it follows the opcode and mandatory spaces. The fourth field, which is optional, is a comment; everything on the line following a hash-sign is ignored. Here is a sample assembly-language program that prints the sum of a series of integers entered by the user, with the end of input marked by a 0:
# print sum of input numbers (terminated by zero)
ld zero # initialize sum to zero
st sum
loop get # read a number
jz done # no more input if number is zero
add sum # add input to accumulated sum
st sum # store new value back in sum
j loop # go back and read another number
done ld sum # print sum
put
halt
zero const 0
sum const
The contents of memory after loading the sample program are shown on the next page.
Your task is to write a program that assembles a program written in our simple assembly language and loads the program into memory; we’ll write a simulator for our hypothetical computer in the next exercise. When you are finished, you are welcome to read or run a suggested solution, or to post your own solution or discuss the exercise in the comments below.
Solution in Racket (https://gist.github.com/adolfopa/11063222#file-asm-1-notests-rkt)
#lang racket/base (require racket/dict racket/match racket/sequence racket/set racket/string) (define instructions (hash "const" 00 "get" 01 "put" 02 "ld" 03 "st" 04 "add" 05 "sub" 06 "jpos" 07 "jz" 08 "j" 09 "halt" 10)) (define instr-regexp (string-join (map regexp-quote (sort (hash-keys instructions) > #:key string-length)) "|" #:before-first "(" #:after-last ")")) (define stmt-regexp (pregexp (string-append "[ \t]*" ; leading whitespace "(?:" ; (1) "(?:(\\w+)[ \t]+)??" ; optional label instr-regexp ; instruction mnemonic "(?:[ \t]+(\\d+|\\w+))?" ; optional argument ")?" ; (1) "(?:#.*)?"))) ; optional comment (struct stmt (label op arg) #:transparent) (define (read-stmt line) (match (regexp-match stmt-regexp line) [(list _ fst snd thd) (when (and snd (not (dict-ref instructions snd #f))) (error "syntax error: unknown mnemonic" line)) (and (or fst snd thd) (stmt fst snd thd))] [#f (error "syntax error: ill formed statement" line)])) (define (read-asm in) (for*/list ([line (in-lines in)] [st (in-value (read-stmt line))] #:when st) st)) (define (make-symbol-table xs) (for/hash ([x xs] [n (in-naturals)] #:when (stmt-label x)) (values (stmt-label x) n))) (define (resolve arg labels) (cond [(not arg) 0] [(regexp-match #px"^\\d+$" arg) (string->number arg)] [else (dict-ref labels arg)])) (define (encode op arg labels) (+ (* (dict-ref instructions op) 1000) (if arg (resolve arg labels) 0))) (define (assemble prog [mem (make-vector 1000 0)]) (define stmts (read-asm prog)) (define labels (make-symbol-table stmts)) (for ([x stmts] [i (in-naturals)]) (match x [(stmt _ "const" arg) (vector-set! mem i (resolve arg labels))] [(stmt _ op arg) (vector-set! mem i (encode op arg labels))])) mem)Both one and two pass assemblers in Python.
import collections import itertools as it import re mnemonic = "const get put ld st add sub jpos jz j halt".split() opcode = {m:c for c,m in enumerate(mnemonic)} regex = re.compile(r"""(?x) # verbose mode regex \A # beginning of line (?P<label>[a-zA-Z_]\w*)? # optional label--no leading whitespace (?:\s+(?P<opcode>[a-zA-Z]+) # optional opcode after 1 or more ws (?:\s+(?P<operand>\w+))?)? # followed by an optional operand \s* # trailing ws (?:\#.*)? # optional comment \Z # end of line """) EBadArg = "Error: '{label}' is referenced but not defined." EBadLabel = "Error: Duplicate label '{label}' at line {line_no}." EBadOp = "Error: Unrecognized instruction mnuemonic: '{op}'." ETooLong = "Error: Program is too long." WUnusedLabel = "Warning: Unused label: '{label}'." def perror(fmt, *args, **kwds): print(fmt.format(*args, **kwds)) def one_pass(source, listing=False): class Symbol: def __init__(self): self.addr = None self.xref = [] symtab = collections.defaultdict(Symbol) memory = [0] * 1000 warnings = 0 errors = 0 pc = 0 for line_no, line in enumerate(src): mo = regex.match(line) label, op, arg = mo.groups() if label: if symtab[label].addr is not None: perror(EBadLabel, label=label, line_no=line_no) errors += 1 # save label address and fix earlier references symtab[label].addr = pc for addr in symtab[label].xref: memory[addr] += pc if op: instruction = opcode.get(op, None) if instruction is None: perror(EBadOp, op=op) errors += 1 if arg: if arg.isdigit(): val = int(arg) else: symtab[arg].xref.append(pc) val = symtab[arg].addr or 0 else: val = 0 if pc > 999 and not toolong: perror(ETooLong) errors += 1 toolong = True if not errors: memory[pc] = instruction * 1000 + val pc += 1 for label, info in symtab.items(): if info.addr is None: perror(EBadArg, label=label) errors += 1 if info.xref == []: perror(WUnusedLabel, label=label) warnings += 1 if errors or warnings: print("There were {} errors and {} warnings.".format(errors, warnings)) return None else: return memory def two_pass(source, listing=False): parsed = [] address = {} memory = [0] * 1000 # first pass pc = 0 for line_no, line in enumerate(source): label, op, arg = regex.match(line).groups() parsed.append((label, op, arg, line_no, line)) if label and label not in address: address[label] = pc if op: pc += 1 # second pass pc = 0 toolong = False errors = [] nerrors = 0 if listing: print("pc code line source\n--- ----- ----- ----------------") for label, op, arg, line_no, line in parsed: if label and address[label] != pc: errors.append((EBadLabel, {'label':label, 'line_no':line_no})) if op: instruction = opcode.get(op, None) if arg: val = int(arg) if arg.isdigit() else address.get(arg, None) else: val = 0 if instruction is None: errors.append((EBadOp, {'op':op})) if val is None: errors.append((EBadArg, {'label':arg})) if pc > 999 and not toolong: errors.append((ETooLong, {})) toolong = True if not errors: memory[pc] = instruction * 1000 + val if listing: fmt = "{:03} {:05} {:05} {}" print(fmt.format(pc, memory[pc], line_no, line)) if errors: nerrors += len(errors) if not listing: fmt = "{:05} {}" print(fmt.format(line_no, line)) for error, kws in errors: perror(error, **kws) errors = [] print() pc += 1 else: if listing: fmt = " {:05} {}" print(fmt.format(line_no, line)) return None if nerrors > 0 else memory(def C
{:M (make-array String 1000)
:acc 0
:opc (apply hash-map
(mapcat list
[“const” “get” “put” “ld” “st” “add” “sub” “jpos” “jz” “j” “halt”]
(map #(format “%02d” %) (range))))})
(defn reader
(let [re #”^(\w\S*)?\s+(\w+)\s*(\w*)”
whitespace #”^(#.*)?$”
code-lines (remove #(re-matches whitespace %)
(clojure.string/split code #”\n”))]
(map #(apply vector (rest (re-find re %))) code-lines)))
(defn label-def? [line] (first line))
(defn get-labels
(reduce
(fn [r [[label opc arg] pos]]
(assoc r label pos))
{“” 0}
(filter #(label-def? (first %))
(map list code (range)))))
(defn translate [opcodes source-code]
(let
(reduce
(fn [r [_ opc obj]]
(conj r
(if (= opc “const”) obj
(str (opcodes opc)
(format “%03d” (labels obj))))))
[]
code)))
(defn load-asm [computer code]
(doseq [[pos word] (map list (range) (translate (:opc C) code))]
(aset (:M computer) pos word))
computer)
; invoke with (load-asm C
)(def C {:M (make-array String 1000) :acc 0 :opc (apply hash-map (mapcat list ["const" "get" "put" "ld" "st" "add" "sub" "jpos" "jz" "j" "halt"] (map #(format "%02d" %) (range))))}) (defn reader [code] (let [re #"^(\w\S*)?\s+(\w+)\s*(\w*)" whitespace #"^(#.*)?$" code-lines (remove #(re-matches whitespace %) (clojure.string/split code #"\n"))] (map #(apply vector (rest (re-find re %))) code-lines))) (defn label-def? [line] (first line)) (defn get-labels [code] (reduce (fn [r [[label opc arg] pos]] (assoc r label pos)) {"" 0} (filter #(label-def? (first %)) (map list code (range))))) (defn translate [opcodes source-code] (let [code (reader source-code) labels (get-labels code)] (reduce (fn [r [_ opc obj]] (conj r (if (= opc "const") obj (str (opcodes opc) (format "%03d" (labels obj)))))) [] code))) (defn load-asm [computer code] (doseq [[pos word] (map list (range) (translate (:opc C) code))] (aset (:M computer) pos word)) computer) ; invoke with (load-asm C <code>)PS: sorry for the previous comment