[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

A portable read implementation

This page is part of the web mail archives of SRFI 38 from before July 7th, 2015. The new archives for SRFI 38 contain all messages, not just those from before July 7th, 2015.



Below are portable implementations of a read and write that use the
srfi-38 notation.

Some nitty gritty:

* The composition of characters into tokens and of tokens into data
  should be explicitly described in BNF and in English.  Points
  to cover:
    -- The #nn# and #nn= tokens are self-terminating
    -- Leading zeros allowed?  #007#
    -- Multiple labels allowed?  #1=#2=(#1# #2#)
    -- Labeled references allowed?  #1=(#2=#1# #2#)
    -- Require that #1=#1# signal an error rather than cause an infinite loop.

* Mention explicitly that numbers do not survive across reads and
  writes.

* May the numbers start at any number?  Must they be used
  sequentially?  May they be reused?  May they be used before they are
  declared, (#1# #1=foo)?  I think all these answers should be No.

The implementations below are not heavily tested.
Read-with-numbered-parts should be able to handle any correct r5rs
datum.  It signals some errors but allows several non-standard things
as well, like (. x) and #(x . (y)).

-al

(define (read-with-numbered-parts . optional-port)
  (define port
    (if (null? optional-port) (current-input-port) (car optional-port)))

  (define (read-char*) (read-char port))
  (define (peek-char*) (peek-char port))

  (define (looking-at? c)
    (eqv? c (peek-char*)))

  (define (delimiter? c)
    (case c
      ((#\( #\) #\" #\;) #t)
      (else (or (eof-object? c)
		(char-whitespace? c)))))

  (define (not-delimiter? c) (not (delimiter? c)))

  (define (eat-intertoken-space)
    (define c (peek-char*))
    (cond ((eof-object? c))
	  ((char-whitespace? c) (read-char*) (eat-intertoken-space))
	  ((char=? c #\;)
	   (do ((c (read-char*) (read-char*)))
	       ((or (eof-object? c) (char=? c #\newline))))
	   (eat-intertoken-space))))

  (define (read-string)
    (read-char*)
    (let read-it ((chars '()))
      (let ((c (read-char*)))
	(case c
	  ((#\") (list->string (reverse chars)))
	  ((#\\) (read-it (cons (read-char*) chars)))
	  (else (read-it (cons c chars)))))))

  ;; reads chars that match PRED and returns them as a string.
  (define (read-some-chars pred)
    (let iter ((chars '()))
      (let ((c (peek-char*)))
	(if (or (eof-object? c) (not (pred c)))
	    (list->string (reverse chars))
	    (iter (cons (read-char*) chars))))))

  ;; reads a character after the #\ has been read.
  (define (read-character)
    (let ((c (peek-char*)))
      (if (char-alphabetic? c)
	  (let ((name (read-some-chars char-alphabetic?)))
	    (cond ((= 1 (string-length name)) (string-ref name 0))
		  ((string-ci=? name "space") #\space)
		  ((string-ci=? name "newline") #\newline)
		  (else (error "Unknown named character: " name))))
	  (read-char*))))

  (define (read-number first-char)
    (let ((str (string-append (string first-char)
			      (read-some-chars not-delimiter?))))
      (or (string->number str)
	  (error "Malformed number: " str))))

  (define char-standard-case
    (if (char=? #\a (string-ref (symbol->string 'a) 0))
	char-downcase
	char-upcase))

  (define (string-standard-case str)
    (let* ((len (string-length str))
	   (new (make-string len)))
      (do ((i 0 (+ i 1)))
	  ((= i len) new)
	(string-set! new i (char-standard-case (string-ref str i))))))

  (define (read-identifier)
    (string->symbol (string-standard-case (read-some-chars not-delimiter?))))

  (define (read-part-spec)
    (let ((n (string->number (read-some-chars char-numeric?))))
      (let ((c (read-char*)))
	(case c
	  ((#\=) (cons 'label n))
	  ((#\#) (cons 'use n))
	  (else (error "Malformed shared part specifier"))))))

  ;; Tokens: strings, characters, numbers, booleans, and
  ;; identifiers/symbols are represented as themselves.
  ;; Single-character tokens are represented as (CHAR), the
  ;; two-character tokens #( and ,@ become (#\#) and (#\@).
  ;; #NN= and #NN# become (label NN) and (use NN).
  (define (read-token)
    (eat-intertoken-space)
    (let ((c (peek-char*)))
      (case c
	((#\( #\) #\' #\`) (read-char*) (list c))
	((#\,)
	 (read-char*)
	 (if (looking-at? #\@)
	     (begin (read-char*) '(#\@))
	     '(#\,)))
	((#\") (read-string))
	((#\.)
	 (read-char*)
	 (cond ((delimiter? (peek-char*)) '(#\.))
	       ((not (looking-at? #\.)) (read-number #\.))
	       ((begin (read-char*) (looking-at? #\.)) (read-char*) '...)
	       (else (error "Malformed token starting with \"..\""))))
	((#\+) (read-char*) (if (delimiter? (peek-char*)) '+ (read-number c)))
	((#\-) (read-char*) (if (delimiter? (peek-char*)) '- (read-number c)))
	((#\#)
	 (read-char*)
	 (let ((c (peek-char*)))
	   (case c
	     ((#\() (read-char*) '(#\#))
	     ((#\\) (read-char*) (read-character))
	     ((#\t #\T) (read-char*) #t)
	     ((#\f #\F) (read-char*) #f)
	     (else (cond ((char-numeric? c) (read-part-spec))
			 (else (read-number #\#)))))))
	(else (cond ((eof-object? c) c)
		    ((char-numeric? c) (read-char*) (read-number c))
		    (else (read-identifier)))))))

  ;; Maps the number of each part to a thunk that returns the part.
  (define parts-alist '())

  (define (read-object)
    (finish-reading-object (read-token)))

  (define (finish-reading-object first-token)
    (if (not (pair? first-token))
	first-token
	(case (car first-token)
	  ((#\() (read-tail))
	  ((#\#) (list->vector (read-tail)))
	  ((#\. #\)) (error "Unexpected \"" token "\""))
	  ((use) (let ((n (cdr first-token)))
		   (cond ((assv n parts-alist) => cdr)
			 (else (error "Use of undeclared part " n)))))
	  ((label)
	   ;; This is complicated in order to allow #1=#2=() and
	   ;; #1=(#2=#1#) and not to loop forever when given #1=#1#.
	   (let ((n (cdr first-token)))
	     (if (assv n parts-alist)
		 (error "Double declaration of part " n)
		 (let read-labels ((labels (list n)))
		   (define (add-labels-to-alist! thunk)
		     (set! parts-alist
			   (append (map (lambda (n) (cons n thunk)) labels)
				   parts-alist)))
		   (let ((token (read-token)))
		     (cond ((and (pair? token) (eq? 'label (car token)))
			    ;; An additional label in a chain of them.
			    (let ((n (cdr token)))
			      (if (or (assv n parts-alist) (memv n labels))
				  (error "Double declaration of part " n))
			      (read-labels (cons n labels))))
			   ((and (pair? token) (eq? 'use (car token)))
			    ;; The labeled object is a use of a previous label:
			    ;; reuse its thunk for our thunk.
			    (let* ((n (cdr token)) (p (assv n parts-alist)))
			      (if (not p)
				  (error "Use of undeclared part " n)
				  (let ((thunk (cdr p)))
				    (add-labels-to-alist! thunk)
				    thunk))))
			   (else
			    ;; Normal case.  Make a thunk (which doesn't
			    ;; need to be usable until the read is complete.)
			    (letrec
				((obj (begin
					(add-labels-to-alist! (lambda () obj))
					(finish-reading-object token))))
			      obj))))))))
	  (else (list (caadr (assv (car first-token)
				   '((#\' 'x) (#\, ,x) (#\` `x) (#\@ ,@x))))
		      (read-object))))))

  (define (read-tail)
    (let ((token (read-token)))
      (cond ((eof-object? token) (error "EOF inside a list or vector"))
	    ((not (pair? token)) (cons token (read-tail)))
	    (else (case (car token)
		    ((#\)) '())
		    ((#\.) (let* ((obj (read-object))
				  (tok (read-token)))
			     (if (and (pair? tok) (char=? #\) (car tok)))
				 obj
				 (error "Extra junk after a dot"))))
		    (else (let ((obj (finish-reading-object token)))
			    (cons obj (read-tail)))))))))

  (let ((obj (read-object)))
    (let fill-in-parts ((obj obj))
      (cond ((pair? obj)
	     (if (procedure? (car obj))
		 (set-car! obj ((car obj)))
		 (fill-in-parts (car obj)))
	     (if (procedure? (cdr obj))
		 (set-cdr! obj ((cdr obj)))
		 (fill-in-parts (cdr obj))))
	    ((vector? obj)
	     (let ((len (vector-length obj)))
	       (do ((i 0 (+ i 1)))
		   ((= i len))
		 (let ((elt (vector-ref obj i)))
		   (if (procedure? elt)
		       (vector-set! obj i (elt))
		       (fill-in-parts elt))))))))
    obj))


(define (write-with-numbered-parts obj . optional-port)
  (define port
    (if (null? optional-port) (current-output-port) (car optional-port)))

  (define (write* obj) (write obj port))
  (define (display* obj) (display obj port))
  
  (define (acons key val alist)
    (cons (cons key val) alist))
  ;; We only track duplicates of pairs, vectors, and strings.  We
  ;; ignore zero-length vectors and strings because r5rs doesn't
  ;; guarantee that eq? treats them sanely (and they aren't very
  ;; interesting anyway).
  (define (interesting? obj)
    (or (pair? obj)
	(and (vector? obj) (not (zero? (vector-length obj))))
	(and (string? obj) (not (zero? (string-length obj))))))

  ;; (write-obj OBJ ALIST):
  ;; ALIST has an entry for each interesting part of OBJ.  The
  ;; associated value will be:
  ;;  -- a number if the part has been given one,
  ;;  -- #t if the part will need to be assigned a number but has not been yet,
  ;;  -- #f if the part will not need a number.
  ;; The cdr of ALIST's first element should be the most recently
  ;; assigned number.
  ;; Returns an alist with new shadowing entries for any parts that
  ;; had numbers assigned.
  (define (write-obj obj alist)
    (define (write-interesting alist)
      (cond ((pair? obj)
	     (let ((caro (car obj)) (cdro (cdr obj)))
	       (cond ((and (pair? cdro)
			   (null? (cdr cdro))
			   (let ((abbrev (assq caro '('"'" `"`" ,"," ,@",@"))))
			     (and abbrev
				  ;; we can't abbreviate (quote . #1#)
				  (not (cdr (assq cdro alist)))
				  abbrev)))
		      => (lambda (abbrev)
			   (display* (cadr abbrev))
			   (write-obj (car cdro) alist)))
		     (else (display* "(")
			   (let write-cdr ((obj cdro)
					   (alist (write-obj caro alist)))
			     (cond ((and (pair? obj)
					 (not (cdr (assq obj alist))))
				    (display* " ")
				    (write-cdr (cdr obj)
					       (write-obj (car obj) alist)))
				   ((null? obj) (display* ")") alist)
				   (else (display* " . ")
					 (let ((alist (write-obj obj alist)))
					   (display* ")")
					   alist))))))))
	    ((vector? obj)
	     (display* "#(")
	     (let ((len (vector-length obj)))
	       (do ((i 1 (+ i 1))
		    (alist (write-obj (vector-ref obj 0) alist)
			   (write-obj (vector-ref obj i) alist)))
		   ((= i len) (display* ")") alist)
		 (display* " "))))
	    ;; else it's a string
	    (else (write* obj) alist)))

    (cond ((interesting? obj)
	   (let ((val (cdr (assq obj alist))))
	     (cond ((not val) (write-interesting alist))
		   ((number? val)
		    (display* "#") (write* val) (display* "#") alist)
		   (else
		    (let ((n (+ 1 (cdar alist))))
		      (display* "#") (write* n) (display* "=")
		      (write-interesting (acons obj n alist)))))))
	  (else (write* obj) alist)))

  ;; Scan computes the initial value of the alist, which maps each
  ;; interesting part of the object to #t if it occurs multiple times,
  ;; #f if only once.
  (define (scan obj alist)
    (cond ((not (interesting? obj)) alist)
	  ((assq obj alist) =>
	   (lambda (p) (if (cdr p) alist (acons obj #t alist))))
	  (else
	   (let ((alist (acons obj #f alist)))
	     (cond ((pair? obj) (scan (car obj) (scan (cdr obj) alist)))
		   ((vector? obj)
		    (let ((len (vector-length obj)))
		      (do ((i 0 (+ 1 i))
			   (alist alist (scan (vector-ref obj i) alist)))
			  ((= i len) alist))))
		   (else alist))))))

  (write-obj obj (acons 'dummy 0 (scan obj '())))
  ;; Don't want to return the big alist that write-obj did, lest it hinder gc.
  (if #f #f))