itokens.icn - source code

Source file itokens.icn
############################################################################
#
#	File:     itokens.icn
#
#	Subject:  Procedures for tokenizing Icon code
#
#	Author:   Richard L. Goerwitz
#
#	Date:	  March 3, 1996
#
#	Modified: Bruce Rennie
#
#	Date:     August 7, 2020
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	Version:  1.12
#
############################################################################
#
#  This file contains itokens() - a utility for breaking Icon source
#  files up into individual tokens.  This is the sort of routine one
#  needs to have around when implementing things like pretty printers,
#  preprocessors, code obfuscators, etc.  It would also be useful for
#  implementing cut-down implementations of Icon written in Icon - the
#  sort of thing one might use in an interactive tutorial.
#
#  Itokens(f, x) takes, as its first argument, f, an open file, and
#  suspends successive TOK records.  TOK records contain two fields.
#  The first field, sym, contains a string that represents the name of
#  the next token (e.g. "CSET", "STRING", etc.).  The second field,
#  str, gives that token's literal value.  E.g. the TOK for a literal
#  semicolon is TOK("SEMICOL", ";").  For a mandatory newline, itokens
#  would suspend TOK("SEMICOL", "\n").
#
#  Unlike Icon's own tokenizer, itokens() does not return an EOFX
#  token on end-of-file, but rather simply fails.  It also can be
#  instructed to return syntactically meaningless newlines by passing
#  it a nonnull second argument (e.g. itokens(infile, 1)).  These
#  meaningless newlines are returned as TOK records with a null sym
#  field (i.e. TOK(&null, "\n")).
#
#  NOTE WELL: If new reserved words or operators are added to a given
#  implementation, the tables below will have to be altered.  Note
#  also that &keywords should be implemented on the syntactic level -
#  not on the lexical one.  As a result, a keyword like &features will
#  be suspended as TOK("CONJUNC", "&") and TOK("IDENT", "features").
#
#  Updates to this file are to bring it to the current Unicon 13 Language
#
############################################################################
#
#  Links:  scan
#
############################################################################
#
#  Requires: coexpressions
#
############################################################################

link scan

global next_c, line_number
record TOK(sym, str)

#
# main:  an Icon source code uglifier
#
#     Stub main for testing; uncomment & compile.  The resulting
#     executable will act as an Icon file compressor, taking the
#     standard input and outputting Icon code stripped of all
#     unnecessary whitespace.  Guaranteed to make the code a visual
#     mess :-).
#
#procedure main()
#
#    local separator, T
#    separator := ""
#    every T := itokens(&input) do {
#	if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
#	then writes(separator)
#	if T.sym == "SEMICOL" then writes(";") else writes(T.str)
#	if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
#	then separator := " " else separator := ""
#    }
#
#end


#
# itokens:  file x anything    -> TOK records (a generator)
#           (stream, nostrip)  -> Rs
#
#     Where stream is an open file, anything is any object (it only
#     matters whether it is null or not), and Rs are TOK records.
#     Note that itokens strips out useless newlines.  If the second
#     argument is nonnull, itokens does not strip out superfluous
#     newlines.  It may be useful to keep them when the original line
#     structure of the input file must be maintained.
#
procedure itokens(stream, nostrip)

    local T, last_token

    # initialize to some meaningless value
    last_token := TOK()

    every T := \iparse_tokens(stream) do {
	if \T.sym then {
	    if T.sym == "EOFX" then fail
	    else {
		#
		# If the last token was a semicolon, then interpret
		# all ambiguously unary/binary sequences like "**" as
		# beginners (** could be two unary stars or the [c]set
		# intersection operator).
		#
		if \last_token.sym == "SEMICOL"
		then suspend last_token := expand_fake_beginner(T)
		else suspend last_token := T
	    }
	} else {
	    if \nostrip
	    then suspend last_token := T
	}
    }

end


#
# expand_fake_beginner: TOK record -> TOK records
#
#     Some "beginner" tokens aren't really beginners.  They are token
#     sequences that could be either a single binary operator or a
#     series of unary operators.  The tokenizer's job is just to snap
#     up as many characters as could logically constitute an operator.
#     Here is where we decide whether to break the sequence up into
#     more than one op or not.
#
procedure expand_fake_beginner(next_token)

    static exptbl
    initial {
	exptbl := table()
	insert(exptbl, "CONCAT",  [TOK("BAR", "|"),   TOK("BAR", "|")])
	insert(exptbl, "DIFF",    [TOK("MINUS", "-"), TOK("MINUS", "-")])
 	insert(exptbl, "EQUIV",   [TOK("NUMEQ", "="), TOK("NUMEQ", "="),
				   TOK("NUMEQ", "=")])
	insert(exptbl, "INTER",   [TOK("STAR", "*"),  TOK("STAR", "*")])
	insert(exptbl, "LCONCAT", [TOK("BAR", "|"),   TOK("BAR", "|"),
				   TOK("BAR", "|")])
	insert(exptbl, "LEXEQ",   [TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
	insert(exptbl, "LEXNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "="),
				   TOK("NUMEQ", "=")])
	insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ","="),
				   TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
	insert(exptbl, "NUMNE",   [TOK("TILDE", "~"), TOK("NUMEQ","=")])
	insert(exptbl, "PMATCH",  [TOK("QMARK", "?"), TOK("QMARK", "?")])
	insert(exptbl, "UNION",   [TOK("PLUS", "+"),  TOK("PLUS", "+")])
    }

    if \exptbl[next_token.sym]
    then suspend !exptbl[next_token.sym]
    else return next_token

end


#
# iparse_tokens:  file     -> TOK records (a generator)
#                 (stream) -> tokens
#
#     Where file is an open input stream, and tokens are TOK records
#     holding both the token type and actual token text.
#
#     TOK records contain two parts, a preterminal symbol (the first
#     "sym" field), and the actual text of the token ("str").  The
#     parser only pays attention to the sym field, although the
#     strings themselves get pushed onto the value stack.
#
#     Note the following kludge:  Unlike real Icon tokenizers, this
#     procedure returns syntactially meaningless newlines as TOK
#     records with a null sym field.  Normally they would be ignored.
#     I wanted to return them so they could be printed on the output
#     stream, thus preserving the line structure of the original
#     file, and making later diagnostic messages more usable.
#
#     Changes to this procedure include adding all addition reserved
#     words that have entered the Unicon language, adding each of the
#     additional operators that relate to message passing and pattern
#     mathcing.
#
procedure iparse_tokens(stream, getchar)

    local elem, whitespace, token, last_token, primitives, reserveds
    static be_tbl, reserved_tbl, operators
    initial {

	#  Primitive Tokens
	#
	primitives := [
		       ["identifier",      "IDENT",     "be"],
		       ["integer-literal", "INTLIT",    "be"],
		       ["real-literal",    "REALLIT",   "be"],
		       ["string-literal",  "STRINGLIT", "be"],
		       ["cset-literal",    "CSETLIT",   "be"],
		       ["end-of-file",     "EOFX",      "" ]]

	# Reserved Words
	#
	reserveds  := [
		       ["abstract",        "ABSTRACT",  ""  ],
		       ["break",           "BREAK",     "be"],
		       ["by",              "BY",        ""  ],
		       ["case",            "CASE",      "b" ],
		       ["class",           "CLASS",     ""  ],
		       ["create",          "CREATE",    "b" ],
		       ["critical",        "CRITICAL",  "b" ],
		       ["default",         "DEFAULT",   "b" ],
		       ["do",              "DO",        ""  ],
                       ["else",            "ELSE",      ""  ],
		       ["end",             "END",       "b" ],
		       ["every",           "EVERY",     "b" ],
		       ["fail",            "FAIL",      "be"],
		       ["global",          "GLOBAL",    ""  ],
		       ["if",              "IF",        "b" ],
		       ["import",          "IMPORT",    "b" ],
		       ["initial",         "INITIAL",   "b" ],
		       ["initially",       "INITIALLY", "be"],
		       ["invocable",       "INVOCABLE", ""  ],
		       ["link",            "LINK",      ""  ],
		       ["local",           "LOCAL",     "b" ],
		       ["method",          "METHOD",    ""  ],
		       ["next",            "NEXT",      "be"],
		       ["not",             "NOT",       "b" ],
		       ["of",              "OF",        ""  ],
		       ["package",         "PACKAGE",   ""  ],
		       ["procedure",       "PROCEDURE", ""  ],
		       ["record",          "RECORD",    ""  ],
		       ["repeat",          "REPEAT",    "b" ],
		       ["return",          "RETURN",    "be"],
		       ["static",          "STATIC",    "b" ],
		       ["suspend",         "SUSPEND",   "be"],
		       ["then",            "THEN",      ""  ],
		       ["thread",          "THREAD",    "b" ],
		       ["to",              "TO",        ""  ],
		       ["until",           "UNTIL",     "b" ],
		       ["while",           "WHILE",     "b" ]]

	# Operators
	#
	operators  := [
		       [":=",              "ASSIGN",    ""  ],
		       ["@",               "AT",        "b" ],
		       ["@:=",             "AUGACT",    ""  ],
		       ["&:=",             "AUGAND",    ""  ],
		       ["=:=",             "AUGEQ",     ""  ],
		       ["===:=",           "AUGEQV",    ""  ],
		       [">=:=",            "AUGGE",     ""  ],
		       [">:=",             "AUGGT",     ""  ],
		       ["<=:=",            "AUGLE",     ""  ],
		       ["<:=",             "AUGLT",     ""  ],
		       ["~=:=",            "AUGNE",     ""  ],
		       ["~===:=",          "AUGNEQV",   ""  ],
		       ["==:=",            "AUGSEQ",    ""  ],
		       [">>=:=",           "AUGSGE",    ""  ],
		       [">>:=",            "AUGSGT",    ""  ],
		       ["<<=:=",           "AUGSLE",    ""  ],
		       ["<<:=",            "AUGSLT",    ""  ],
		       ["~==:=",           "AUGSNE",    ""  ],
		       ["\\",              "BACKSLASH", "b" ],
		       ["!",               "BANG",      "b" ],
		       ["|",               "BAR",       "b" ],
		       ["@<<",             "BLKRECM",   "be"],
		       ["@>>",             "BLKSENDM",  "be"],
		       ["^",               "CARET",     "b" ],
		       ["^:=",             "CARETASGN", "b" ],
		       [":",               "COLON",     ""  ],
		       ["::",              "COLONCOLON","b" ],
		       [",",               "COMMA",     ""  ],
		       ["->",              "CONDASGN",  ""  ],
		       ["||",              "CONCAT",    "b" ],
                       ["||:=",            "CONCATASGN",""  ],
		       ["&",               "CONJUNC",   "b" ],
		       [".>",              "CURASGN",   "b"  ],
		       [".",               "DOT",       "b" ],
		       ["--",              "DIFF",      "b" ],
		       ["--:=",            "DIFFASGN",  ""  ],
		       ["===",             "EQUIV",     "b" ],
		       ["=>",              "IMMASGN",  ""  ],
		       ["**",              "INTER",     "b" ],
		       ["**:=",            "INTERASGN", ""  ],
		       ["{",               "LBRACE",    "b" ],
		       ["[",               "LBRACK",    "b" ],
		       ["|||",             "LCONCAT",   "b" ],
		       ["|||:=",           "LCONCATASGN","" ],
		       ["==",              "LEXEQ",     "b" ],
		       [">>=",             "LEXGE",     ""  ],
		       [">>",              "LEXGT",     ""  ],
		       ["<<=",             "LEXLE",     ""  ],
		       ["<<",              "LEXLT",     ""  ],
		       ["~==",             "LEXNE",     "b" ],
		       ["[:",              "LLISTCOMP", "b" ],
		       ["(",               "LPAREN",    "b" ],
		       ["-:",              "MCOLON",    ""  ],
		       ["-",               "MINUS",     "b" ],
		       ["-:=",             "MINUSASGN", ""  ],
		       ["%",               "MOD",       ""  ],
		       ["%:=",             "MODASGN",   ""  ],
		       ["~===",            "NOTEQUIV",  "b" ],
		       ["=",               "NUMEQ",     "b" ],
		       [">=",              "NUMGE",     ""  ],
		       [">",               "NUMGT",     ""  ],
		       ["<=",              "NUMLE",     ""  ],
		       ["<",               "NUMLT",     ""  ],
		       ["~=",              "NUMNE",     "b" ],
		       [".|",              "PATTALT",   ""  ],
		       ["+:",              "PCOLON",    ""  ],
		       ["+",               "PLUS",      "b" ],
		       ["+:=",             "PLUSASGN",  ""  ],
		       ["??",              "PMATCH",    "b" ],
		       ["?",               "QMARK",     "b" ],
		       ["<-",              "REVASSIGN", ""  ],
		       ["<->",             "REVSWAP",   ""  ],
		       ["}",               "RBRACE",    "e" ],
		       ["]",               "RBRACK",    "e" ],
		       ["@<",              "RECM",      "be"],
		       [":]",              "RLISTCOMP", "e" ],
		       [")",               "RPAREN",    "e" ],
		       ["?:=",             "SCANASGN",  ""  ],
		       [";",               "SEMICOL",   ""  ],
		       ["@>",              "SENDM",     "be"],
		       ["/",               "SLASH",     "b" ],
		       ["/:=",             "SLASHASGN", ""  ],
		       ["*",               "STAR",      "b" ],
		       ["*:=",             "STARASGN",  ""  ],
		       [":=:",             "SWAP",      ""  ],
		       ["~",               "TILDE",     "b" ],
		       ["++",              "UNION",     "b" ],
		       ["++:=",            "UNIONASGN", ""  ],
		       ["$(",              "LBRACE",    "b" ],
		       ["$)",              "RBRACE",    "e" ],
		       ["$<",              "LBRACK",    "b" ],
		       ["$>",              "RBRACK",    "e" ],
		       ["$",               "RHSARG",    "b" ],
		       ["%$(",             "BEGGLOB",   "b" ],
		       ["%$)",             "ENDGLOB",   "e" ],
		       ["%{",              "BEGGLOB",   "b" ],
		       ["%}",              "ENDGLOB",   "e" ],
		       ["%%",              "NEWSECT",   "be"]]

	# static be_tbl, reserved_tbl
	reserved_tbl := table()
	every elem := !reserveds do
	    insert(reserved_tbl, elem[1], elem[2])
	be_tbl := table()
	every elem := !primitives | !reserveds | !operators do {
	    insert(be_tbl, elem[2], elem[3])
	}
    }

    /getchar   := create {
	line_number := 0
	! ( 1(!stream, line_number +:=1) || "\n" )
    }
    whitespace := ' \t'
    /next_c    := @getchar | {
	if \stream then
	    return TOK("EOFX")
	else fail
    }

    repeat {
	case next_c of {


	    "\n"     : {
		# If do_newline fails, it means we're at the end of
		# the input stream, and we should break out of the
		# repeat loop.
		#
		every last_token := do_newline(getchar, last_token, be_tbl)
		do suspend last_token
		if next_c === &null then break
		next
	    }

	    "\#"     : {
		# Just a comment.  Strip it by reading every character
		# up to the next newline.  The global var next_c
		# should *always* == "\n" when this is done.
		#
		do_number_sign(getchar)
#		write(&errout, "next_c == ", image(next_c))
		next
	    }

	    "\""    : {
		# Suspend as STRINGLIT everything from here up to the
		# next non-backslashed quotation mark, inclusive
		# (accounting for the _ line-continuation convention).
		#
		last_token := do_quotation_mark(getchar)
		suspend last_token
#		write(&errout, "next_c == ", image(next_c))
		next
	    }

	    "'"     : {
		# Suspend as CSETLIT everything from here up to the
		# next non-backslashed apostrophe, inclusive.
		#
		last_token := do_apostrophe(getchar)
		suspend last_token
#		write(&errout, "next_c == ", image(next_c))
		next
	    }

	    &null   : stop("iparse_tokens (lexer):  unexpected EOF")

	    default : {
		# If we get to here, we have either whitespace, an
		# integer or real literal, an identifier or reserved
		# word (both get handled by do_identifier), or an
		# operator.  The question of which we have can be
		# determined by checking the first character.
		#
		if any(whitespace, next_c) then {
		    # Like all of the TOK forming procedures,
		    # do_whitespace resets next_c.
		    do_whitespace(getchar, whitespace)
		    # don't suspend any tokens
		    next
		}
		if any(&digits, next_c) then {
		    last_token := do_digits(getchar)
		    suspend last_token
		    next
		}
		if any(&letters ++ '_', next_c) then {
		    last_token := do_identifier(getchar, reserved_tbl)
		    suspend last_token
		    next
		}
#		write(&errout, "it's an operator or a real number")
		last_token := do_operator(getchar, operators)
		suspend last_token
		next
	    }
	}
    }

    # If stream argument is nonnull, then we are in the top-level
    # iparse_tokens().  If not, then we are in a recursive call, and
    # we should not emit all this end-of-file crap.
    #
    if \stream then {
	return TOK("EOFX")
    }
    else fail

end


#
#  do_dot:  coexpression -> TOK record
#           getchar      -> t
#
#      Where getchar is the coexpression that produces the next
#      character from the input stream and t is a token record whose
#      sym field contains either "REALLIT" or "DOT".  Essentially,
#      do_dot checks the next char on the input stream to see if it's
#      an integer.  Since the preceding char was a dot, an integer
#      tips us off that we have a real literal.  Otherwise, it's just
#      a dot operator.  Note that do_dot resets next_c for the next
#      cycle through the main case loop in the calling procedure.
#
procedure do_dot(getchar)

    local token
    # global next_c

#    write(&errout, "it's a dot")

    # If dot's followed by a digit, then we have a real literal.
    #
    if any(&digits, next_c := @getchar) then {
#	write(&errout, "dot -> it's a real literal")
	token := "." || next_c
	while any(&digits, next_c := @getchar) do
	    token ||:= next_c
	if token ||:= (next_c == ("e"|"E")) then {
	    while (next_c := @getchar) == "0"
	    while any(&digits, next_c) do {
		token ||:= next_c
		next_c = @getchar
	    }
	}
	return TOK("REALLIT", token)
    }

    # Dot not followed by an integer; so we just have a dot operator,
    # and not a real literal.
    #
#    write(&errout, "dot -> just a plain dot")
    return TOK("DOT", ".")

end


#
#  do_newline:  coexpression x TOK record x table -> TOK records
#               (getchar, last_token, be_tbl)     -> Ts (a generator)
#
#      Where getchar is the coexpression that returns the next
#      character from the input stream, last_token is the last TOK
#      record suspended by the calling procedure, be_tbl is a table of
#      tokens and their "beginner/ender" status, and Ts are TOK
#      records.  Note that do_newline resets next_c.  Do_newline is a
#      mess.  What it does is check the last token suspended by the
#      calling procedure to see if it was a beginner or ender.  It
#      then gets the next token by calling iparse_tokens again.  If
#      the next token is a beginner and the last token is an ender,
#      then we have to suspend a SEMICOL token.  In either event, both
#      the last and next token are suspended.
#
procedure do_newline(getchar, last_token, be_tbl)

    local next_token
    # global next_c

#    write(&errout, "it's a newline")

    # Go past any additional newlines.
    #
    while next_c == "\n" do {
        # NL can be the last char in the getchar stream; if it *is*,
	# then signal that it's time to break out of the repeat loop
	# in the calling procedure.
	#
	next_c := @getchar | {
	    next_c := &null
	    fail
	}
	suspend TOK(&null, next_c == "\n")
    }

    # If there was a last token (i.e. if a newline wasn't the first
    # character of significance in the input stream), then check to
    # see if it was an ender.  If so, then check to see if the next
    # token is a beginner.  If so, then suspend a TOK("SEMICOL")
    # record before suspending the next token.
    #
    if find("e", be_tbl[(\last_token).sym]) then {
#	write(&errout, "calling iparse_tokens via do_newline")
#	&trace := -1
	# First arg to iparse_tokens can be null here.
	\ (next_token := iparse_tokens(&null, getchar)).sym
	if \next_token then {
#	    write(&errout, "call of iparse_tokens via do_newline yields ",
#		  ximage(next_token))
	    if find("b", be_tbl[next_token.sym])
	    then suspend TOK("SEMICOL", "\n")
	    #
	    # See below.  If this were like the real Icon parser,
	    # the following line would be commented out.
	    #
	    else suspend TOK(&null, "\n")
	    return next_token
	}
	else {
	    #
	    # If this were a *real* Icon tokenizer, it would not emit
	    # any record here, but would simply fail.  Instead, we'll
	    # emit a dummy record with a null sym field.
	    #
	    return TOK(&null, "\n")
#           &trace := 0
#	    fail
	}
    }

    # See above.  Again, if this were like Icon's own tokenizer, we
    # would just fail here, and not return any TOK record.
    #
#   &trace := 0
    return TOK(&null, "\n")
#   fail

end


#
#  do_number_sign:  coexpression -> &null
#                   getchar      ->
#
#      Where getchar is the coexpression that pops characters off the
#      main input stream.  Sets the global variable next_c.  This
#      procedure simply reads characters until it gets a newline, then
#      returns with next_c == "\n".  Since the starting character was
#      a number sign, this has the effect of stripping comments.
#
procedure do_number_sign(getchar)

    # global next_c

#    write(&errout, "it's a number sign")
    while next_c ~== "\n" do {
	next_c := @getchar
    }

    # Return to calling procedure to cycle around again with the new
    # next_c already set.  Next_c should always be "\n" at this point.
    return

end


#
#  do_quotation_mark:  coexpression -> TOK record
#                      getchar      -> t
#
#      Where getchar is the coexpression that yields another character
#      from the input stream, and t is a TOK record with "STRINGLIT"
#      as its sym field.  Puts everything upto and including the next
#      non-backslashed quotation mark into the str field.  Handles the
#      underscore continuation convention.
#
procedure do_quotation_mark(getchar)

    local token
    # global next_c

    # write(&errout, "it's a string literal")
    token := "\""
    next_c := @getchar
    repeat {
	if next_c == "\n" & token[-1] == "_" then {
	    token := token[1:-1]
	    while any('\t ', next_c := @getchar)
	    next
	} else {
	    if slshupto('"', token ||:= next_c, 2)
	    then {
		next_c := @getchar
		# resume outermost (repeat) loop in calling procedure,
		# with the new (here explicitly set) next_c
		return TOK("STRINGLIT", token)
	    }
	    next_c := @getchar
	}
    }

end


#
#  do_apostrophe:  coexpression -> TOK record
#                  getchar      -> t
#
#      Where getchar is the coexpression that yields another character
#      from the input stream, and t is a TOK record with "CSETLIT"
#      as its sym field.  Puts everything upto and including the next
#      non-backslashed apostrope into the str field.
#
procedure do_apostrophe(getchar)

    local token
    # global next_c

#   write(&errout, "it's a cset literal")
    token := "'"
    next_c := @getchar
    repeat {
	if next_c == "\n" & token[-1] == "_" then {
	    token := token[1:-1]
	    while any('\t ', next_c := @getchar)
	    next
	} else {
	    if slshupto("'", token ||:= next_c, 2)
	    then {
		next_c := @getchar
		# Return & resume outermost containing loop in calling
		# procedure w/ new next_c.
		return TOK("CSETLIT", token)
	    }
	    next_c := @getchar
	}
    }

end


#
#  do_digits:  coexpression -> TOK record
#              getchar      -> t
#
#      Where getchar is the coexpression that produces the next char
#      on the input stream, and where t is a TOK record containing
#      either "REALLIT" or "INTLIT" in its sym field, and the text of
#      the numeric literal in its str field.
#
procedure do_digits(getchar)

    local token, tok_record, extras, digits, over
    # global next_c

    # For bases > 16
    extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
    # Assume integer literal until proven otherwise....
    tok_record := TOK("INTLIT")

#   write(&errout, "it's an integer or real literal")
    token := ("0" ~== next_c) | ""
    while any(&digits, next_c := @getchar) do
	token ||:= next_c
    if token ||:= (next_c == ("R"|"r")) then {
	digits := &digits
	if over := ((10 < token[1:-1]) - 10) * 2 then
	    digits ++:= extras[1:over+1] | extras
	next_c := @getchar
	if next_c == "-" then {
	    token ||:= next_c
	    next_c := @getchar
	}
	while any(digits, next_c) do {
	    token ||:= next_c
	    next_c := @getchar
	}
    } else {
	if token ||:= (next_c == ".") then {
	    while any(&digits, next_c := @getchar) do
		token ||:= next_c
	    tok_record := TOK("REALLIT")
	}
	if token ||:= (next_c == ("e"|"E")) then {
	    next_c := @getchar
	    if next_c == "-" then {
		token ||:= next_c
		next_c := @getchar
	    }
	    while any(&digits, next_c) do {
		token ||:= next_c
		next_c := @getchar
	    }
	    tok_record := TOK("REALLIT")
	}
    }
    tok_record.str := ("" ~== token) | "0"
    return tok_record

end


#
#  do_whitespace:  coexpression x cset  -> &null
#                  getchar x whitespace -> &null
#
#      Where getchar is the coexpression producing the next char on
#      the input stream.  Do_whitespace just repeats until it finds a
#      non-whitespace character, whitespace being defined as
#      membership of a given character in the whitespace argument (a
#      cset).
#
procedure do_whitespace(getchar, whitespace)

#   write(&errout, "it's junk")
    while any(whitespace, next_c) do
	next_c := @getchar
    return

end


#
#  do_identifier:  coexpression x table    -> TOK record
#                  (getchar, reserved_tbl) -> t
#
#      Where getchar is the coexpression that pops off characters from
#      the input stream, reserved_tbl is a table of reserved words
#      (keys = the string values, values = the names qua symbols in
#      the grammar), and t is a TOK record containing all subsequent
#      letters, digits, or underscores after next_c (which must be a
#      letter or underscore).  Note that next_c is global and gets
#      reset by do_identifier.
#
procedure do_identifier(getchar, reserved_tbl)

    local token
    # global next_c

#   write(&errout, "it's an indentifier")
    token := next_c
    while any(&letters ++ &digits ++ '_', next_c := @getchar)
    do token ||:= next_c
    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)

end


#
#  do_operator:  coexpression x list  -> TOK record
#                (getchar, operators) -> t
#
#     Where getchar is the coexpression that produces the next
#     character on the input stream, operators is the operator list,
#     and where t is a TOK record describing the operator just
#     scanned.  Calls recognop, which creates a DFSA to recognize
#     valid Icon operators.  Arg2 (operators) is the list of lists
#     containing valid Icon operator string values and names (see
#     above).
#
procedure do_operator(getchar, operators)

    local token, elem, dot_found

    token := next_c

    if token == "." then  {
	# Could be a real literal *or* a dot based operator.  Check
	# following character to see if it's a digit.  If so,
	# it's a real literal.
	#
	last_token := do_dot(getchar)
	if last_token.sym == "REALLIT" then {
	    return last_token
#	    write(&errout, "next_c == ", image(next_c))
	}
	#
	dot_found := 1
    }
    # Go until recognop fails.
    while elem := recognop(operators, token, 1) do {
	if \dot_found then {
	    dot_found := &null
	    token ||:= next_c
	} else {
	    token ||:= (next_c := @getchar)
	}
    }
#   write(&errout, ximage(elem))
    if *\elem = 1 then
	return TOK(elem[1][2], elem[1][1])
    else fail

end


record dfstn_state(b, e, tbl)
record start_state(b, e, tbl, master_list)
#
#  recognop: list x string x integer -> list
#            (l, s, i)               -> l2
#
#     Where l is the list of lists created by the calling procedure
#     (each element contains a token string value, name, and
#     beginner/ender string), where s is a string possibly
#     corresponding to a token in the list, where i is the position in
#     the elements of l where the operator string values are recorded,
#     and where l2 is a list of elements from l that contain operators
#     for which string s is an exact match.  Fails if there are no
#     operators that s is a prefix of, but returns an empty list if
#     there just aren't any that happen to match exactly.
#
#      What this does is let the calling procedure just keep adding
#      characters to s until recognop fails, then check the last list
#      it returned to see if it is of length 1.  If it is, then it
#      contains list with the vital stats for the operator last
#      recognized.  If it is of length 0, then string s did not
#      contain any recognizable operator.
#
procedure recognop(l, s, i)

    local   current_state, master_list, c, result, j
    static  dfstn_table
    initial dfstn_table := table()

    /i := 1
    # See if we've created an automaton for l already.
    /dfstn_table[l] := start_state(1, *l, &null, &null) & {
	dfstn_table[l].master_list := sortf(l, i)
    }

    current_state := dfstn_table[l]
    # Save master_list, as current_state will change later on.
    master_list   := current_state.master_list

    s ? {
	while c := move(1) do {

	    # Null means that this part of the automaton isn't
	    # complete.
	    #
	    if /current_state.tbl then
		create_arcs(master_list, i, current_state, &pos)

	    # If the table has been clobbered, then there are no arcs
	    # leading out of the current state.  Fail.
	    #
	    if current_state.tbl === 0 then
		fail

#	    write(&errout, "c = ", image(c))
#	    write(&errout, "table for current state = ",
#		  ximage(current_state.tbl))

	    # If we get to here, the current state has arcs leading
	    # out of it.  See if c is one of them.  If so, make the
	    # node to which arc c is connected the current state.
	    # Otherwise fail.
	    #
	    current_state := \current_state.tbl[c] | fail
	}
    }

    # Return possible completions.
    #
    result := list()
    every j := current_state.b to current_state.e do {
	if *master_list[j][i] = *s then
	    put(result, master_list[j])
    }
    # return empty list if nothing the right length is found
    return result

end


#
#  create_arcs:  fill out a table of arcs leading out of the current
#                state, and place that table in the tbl field for
#                current_state
#
procedure create_arcs(master_list, field, current_state, POS)

    local elem, i, first_char, old_first_char

    current_state.tbl := table()
    old_first_char := ""

    every elem := master_list[i := current_state.b to current_state.e][field]
    do {

	# Get the first character for the current position (note that
	# we're one character behind the calling routine; hence
	# POS-1).
	#
	first_char := elem[POS-1] | next

	# If we have a new first character, create a new arc out of
	# the current state.
	#
	if first_char ~== old_first_char then {
	    # Store the start position for the current character.
	    current_state.tbl[first_char] := dfstn_state(i)
	    # Store the end position for the old character.
	    (\current_state.tbl[old_first_char]).e := i-1
	    old_first_char := first_char
	}
    }
    (\current_state.tbl[old_first_char]).e := i

    # Clobber table with 0 if no arcs were added.
    current_state.tbl := (*current_state.tbl = 0)
    return current_state

end
This page produced by UniDoc on 2021/04/15 @ 23:59:45.